# Generación de datos para entrenar un modelo de Machine Learning


### Importar librerías necesarias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


### Cargar el inventario de plazas disponibles en la zona SER de Madrid.

In [12]:
df = pd.read_csv('data/calles_SER_2024.csv', sep=';', encoding='latin-1') 


### Análisis Exploratorio de Datos

In [13]:
df.sample(5)

Unnamed: 0,gis_x,gis_y,distrito,barrio,calle,num_finca,color,bateria_linea,num_plazas
11954,44265915,447695987,05 CHAMARTÍN,05-02 PROSPERIDAD,"CANILLAS, CALLE, DE",19,077214010 Verde,Línea,1.0
1973,43888029,447344433,02 ARGANZUELA,02-01 IMPERIAL,"PIZARRA, CALLE, DE LA",3,077214010 Verde,Batería,13.0
30971,44455937,447653777,15 CIUDAD LINEAL,15-04 CONCEPCIÓN,"VIRGEN DEL FRESNEDO, CALLE, DE LA",7,077214010 Verde,Línea,5.0
17492,44069774,447818907,06 TETUÁN,06-02 CUATRO CAMINOS,"DULCINEA, CALLE, DE",67,077214010 Verde,Línea,7.0
19227,44102344,448024045,06 TETUÁN,06-04 ALMENARA,"MONTOYA, CALLE, DE",48,043000255 Azul,Línea,5.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32205 entries, 0 to 32204
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gis_x          32205 non-null  object 
 1   gis_y          32205 non-null  object 
 2   distrito       32205 non-null  object 
 3   barrio         32205 non-null  object 
 4   calle          32205 non-null  object 
 5   num_finca      32205 non-null  object 
 6   color          32205 non-null  object 
 7   bateria_linea  32205 non-null  object 
 8   num_plazas     32204 non-null  float64
dtypes: float64(1), object(8)
memory usage: 2.2+ MB


In [5]:
df.describe(include='object').round().T

Unnamed: 0,count,unique,top,freq
gis_x,32205,31210,44093746,3
gis_y,32205,31659,447914841,3
distrito,32205,12,05 CHAMARTÍN,6060
barrio,32205,60,05-01 EL VISO,1327
calle,32205,2284,"CASTELLANA, PASEO, DE LA",273
num_finca,32205,644,S/N,2401
color,32205,5,077214010 Verde,27984
bateria_linea,32205,2,Línea,24599


### Valores únicos por columna

In [6]:
# Obtener un nuevo dataframe de dos columnas donde en la primera estén las features (features) y en la otra los valores únicos
# asociados (n_values).

unique_values_output = []
for column in df.columns:
    unique_values = df[column].unique().tolist()
    unique_values_output.append([column, unique_values])

unique_values_df = pd.DataFrame(unique_values_output, columns=['Feature', 'Unique Values'])
print(unique_values_df)

         Feature                                      Unique Values
0          gis_x  [439592,91, 439569,07, 439578,18, 439574,49, 4...
1          gis_y  [4473566,23, 4473598,77, 4473498,76, 4473493,4...
2       distrito  [01  CENTRO, 02  ARGANZUELA, 03  RETIRO, 04  S...
3         barrio  [01-01 PALACIO, 01-02 EMBAJADORES, 01-03 CORTE...
4          calle  [AGUAS, CALLE, DE LAS, AGUILA, CALLE, DEL, ALA...
5      num_finca  [2, 8, 3, 5, 12, 17, 21, 1, 6, 7, 4, 9, 19, 14...
6          color  [077214010 Verde, 255140000 Naranja, 043000255...
7  bateria_linea                                   [Línea, Batería]
8     num_plazas  [7.0, 5.0, 1.0, 9.0, 4.0, 3.0, 2.0, 6.0, 13.0,...


In [9]:
print(df['zona_ser'].unique().tolist())

['verde', 'azul']


In [12]:
df.isnull().sum()

gis_x            0
gis_y            0
distrito         0
barrio           0
calle            0
num_finca        0
color            0
bateria_linea    0
num_plazas       1
dtype: int64

### Investigando cuantas plazas hay por zona

In [15]:
df.groupby('color')['num_plazas'].sum()


color
043000255 Azul              21209.0
077214010 Verde            147490.0
081209246 Alta Rotación       374.0
255000000 Rojo                366.0
255140000 Naranja            1511.0
Name: num_plazas, dtype: float64

In [None]:
import pandas as pd
from geopy.distance import geodesic

# Sample dataframe of coordinates (latitude, longitude)
data = {
    'location': ['A', 'B', 'C', 'D'],
    'latitude': [40.7128, 34.0522, 51.5074, 48.8566],
    'longitude': [-74.0060, -118.2437, -0.1276, 2.3522]
}

df = pd.DataFrame(data)

# Define the center coordinate (latitude, longitude) and radius (in kilometers)
center = (40.439410, -3,695900)  # Example: San Francisco, CA
radius = 5000  # in kilometers

# Function to calculate the distance from the center
def is_within_radius(lat, lon, center, radius):
    point = (lat, lon)
    distance = geodesic(center, point).kilometers
    return distance <= radius

# Apply the function to each row in the dataframe
df['within_radius'] = df.apply(lambda row: is_within_radius(row['latitude'], row['longitude'], center, radius), axis=1)

print(df)



## Transformaciones

### Zonas SER

In [14]:
import pandas as pd

# Assuming your data is in a DataFrame called 'df'
# If not, you can create it like this:
# df = pd.read_csv('your_file.csv', sep=';')

df['zona_ser'] = df['color'].str.split(' ').str[1].str.lower()
df = df[df['zona_ser'].isin(['verde', 'azul'])]


### Generar columnas longitude y latitude

In [15]:
import pandas as pd
import utm

# Assuming your data is in a DataFrame called 'df'
# If not, you can create it like this:
# df = pd.read_csv('your_file.csv', sep=';')

# Replace comma with point for gis_x and gis_y columns
df['gis_x'] = df['gis_x'].str.replace(',', '.').astype(float)
df['gis_y'] = df['gis_y'].str.replace(',', '.').astype(float)

# Function to convert UTM to Latitude and Longitude
def utm_to_latlong(row):
  try:
    lat, lon = utm.to_latlon(row['gis_x'], row['gis_y'], 30, 'N') # Assuming UTM zone 30N, adjust if needed
  except:
    lat, lon = None, None # Handle cases where conversion might fail
  return pd.Series({'latitude': lat, 'longitude': lon})

# Apply the function to create new columns
df[['latitude', 'longitude']] = df.apply(utm_to_latlong, axis=1)

print(df.head())


       gis_x       gis_y    distrito         barrio                 calle  \
0  439592.91  4473566.23  01  CENTRO  01-01 PALACIO  AGUAS, CALLE, DE LAS   
1  439569.07  4473598.77  01  CENTRO  01-01 PALACIO  AGUAS, CALLE, DE LAS   
2  439578.18  4473498.76  01  CENTRO  01-01 PALACIO    AGUILA, CALLE, DEL   
3  439574.49  4473493.40  01  CENTRO  01-01 PALACIO    AGUILA, CALLE, DEL   
4  439559.19  4473471.82  01  CENTRO  01-01 PALACIO    AGUILA, CALLE, DEL   

  num_finca            color bateria_linea  num_plazas zona_ser   latitude  \
0         2  077214010 Verde         Línea         7.0    verde  40.410523   
1         8  077214010 Verde         Línea         5.0    verde  40.410815   
2         3  077214010 Verde         Línea         1.0    verde  40.409915   
3         5  077214010 Verde         Línea         1.0    verde  40.409866   
4        12  077214010 Verde         Línea         9.0    verde  40.409671   

   longitude  
0  -3.711956  
1  -3.712240  
2  -3.712123  
3  -3.71

### Extraer columnas útiles del inventario de plazas SER

In [16]:
import pandas as pd

# Assuming your data is in a DataFrame called 'df'
# If not, you can create it like this:
# df = pd.read_csv('your_file.csv', sep=';')

# Select and reorder columns
df = df[['latitude', 'longitude', 'zona_ser', 'num_plazas']]

df.head()


Unnamed: 0,latitude,longitude,zona_ser,num_plazas
0,40.410523,-3.711956,verde,7.0
1,40.410815,-3.71224,verde,5.0
2,40.409915,-3.712123,verde,1.0
3,40.409866,-3.712166,verde,1.0
4,40.409671,-3.712344,verde,9.0


### Añadir la columna hora_del_dia

In [19]:
df.shape

(769032, 5)

In [18]:
import pandas as pd



# Create a list of time intervals
time_intervals = pd.timedelta_range(start='00:00:00', end='23:59:00', freq='60T')

# Repeat each row 48 times (for each time interval)
df = df.loc[df.index.repeat(len(time_intervals))].reset_index(drop=True)

# Add the 'hora_del_dia' column
df['hora_del_dia'] = time_intervals.repeat(len(df) // len(time_intervals))

print(df.head())


    latitude  longitude zona_ser  num_plazas hora_del_dia
0  40.410523  -3.711956    verde         7.0       0 days
1  40.410523  -3.711956    verde         7.0       0 days
2  40.410523  -3.711956    verde         7.0       0 days
3  40.410523  -3.711956    verde         7.0       0 days
4  40.410523  -3.711956    verde         7.0       0 days


  time_intervals = pd.timedelta_range(start='00:00:00', end='23:59:00', freq='60T')


In [21]:
df.head()

Unnamed: 0,latitude,longitude,zona_ser,num_plazas,hora_del_dia
0,40.410523,-3.711956,verde,7.0,0 days
1,40.410523,-3.711956,verde,7.0,0 days
2,40.410523,-3.711956,verde,7.0,0 days
3,40.410523,-3.711956,verde,7.0,0 days
4,40.410523,-3.711956,verde,7.0,0 days


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769032 entries, 0 to 769031
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype          
---  ------        --------------   -----          
 0   latitude      769032 non-null  float64        
 1   longitude     769032 non-null  float64        
 2   zona_ser      769032 non-null  object         
 3   num_plazas    769008 non-null  float64        
 4   hora_del_dia  769032 non-null  timedelta64[ns]
dtypes: float64(3), object(1), timedelta64[ns](1)
memory usage: 29.3+ MB


In [24]:
df.to_csv('data/dfv1.csv', index=False)

In [23]:
df['hora_del_dia'] = pd.to_datetime(df['hora_del_dia']).dt.strftime('%H:%M')


TypeError: dtype timedelta64[ns] cannot be converted to datetime64[ns]