In [1]:
# Initial imports
# import numpy as np
import pandas as pd
import sklearn.linear_model

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

model_data = (r'C:\Users\clava\data_miners\ML_Model\model_df.csv')
pred_data = (r'C:\Users\clava\data_miners\ML_Model\for_predictions.csv')

In [2]:
def clean_set(data):
  data_df = pd.read_csv(data)

  data_df['ZIP_CODE'] = data_df['ZIP_CODE'].apply(lambda x: f'{x:05}')

  # set ZIP_CODE as index
  data_df = data_df.set_index('ZIP_CODE')

  data_df['EV_TOTAL'] = data_df.iloc[:, -9:-7].sum(axis=1)

  data_df = data_df[['EV_LEVEL_1', 'EV_LEVEL_2', 'EV_FAST', 'EV_TOTAL',
               'NETWORK_TYPE_ChargePoint','NETWORK_TYPE_Non-Networked','NETWORK_TYPE_Other-Network',
               'TOTAL_HOUSEHOLDS','MEDIAN_INCOME','REG_COUNTS']]
  
  return data_df

In [3]:
model_df = clean_set(model_data)
#pred_df = clean_set(pred_data)

In [4]:
model_df

Unnamed: 0_level_0,EV_LEVEL_1,EV_LEVEL_2,EV_FAST,EV_TOTAL,NETWORK_TYPE_ChargePoint,NETWORK_TYPE_Non-Networked,NETWORK_TYPE_Other-Network,TOTAL_HOUSEHOLDS,MEDIAN_INCOME,REG_COUNTS
ZIP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03266,0,2,0,2,0,1,0,851.0,54904.0,0
07001,0,1,0,1,0,1,0,5022.0,69849.0,173
07003,0,12,5,12,4,1,3,18577.0,78034.0,795
07004,0,6,3,6,0,1,2,2621.0,122063.0,307
07005,0,1,10,1,0,1,1,5489.0,111946.0,445
...,...,...,...,...,...,...,...,...,...,...
99352,0,13,3,13,1,1,4,12877.0,81410.0,3413
99354,36,48,0,84,0,45,1,9175.0,73369.0,2082
99361,0,1,0,1,0,0,1,750.0,69118.0,100
99362,1,27,0,28,1,3,12,15717.0,56665.0,2054


In [5]:
def target_feature(dataframe):
  y = model_df.REG_COUNTS

  X = model_df[['EV_LEVEL_1', 'EV_LEVEL_2', 'EV_FAST', 'EV_TOTAL',
                    'NETWORK_TYPE_ChargePoint','NETWORK_TYPE_Non-Networked','NETWORK_TYPE_Other-Network',
                    'TOTAL_HOUSEHOLDS',	'MEDIAN_INCOME']]
  
  return (X,y)

In [6]:
model_X, model_y = target_feature(model_df)

#pred_X, pred_y = target_feature(pred_df)
model_X

Unnamed: 0_level_0,EV_LEVEL_1,EV_LEVEL_2,EV_FAST,EV_TOTAL,NETWORK_TYPE_ChargePoint,NETWORK_TYPE_Non-Networked,NETWORK_TYPE_Other-Network,TOTAL_HOUSEHOLDS,MEDIAN_INCOME
ZIP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
03266,0,2,0,2,0,1,0,851.0,54904.0
07001,0,1,0,1,0,1,0,5022.0,69849.0
07003,0,12,5,12,4,1,3,18577.0,78034.0
07004,0,6,3,6,0,1,2,2621.0,122063.0
07005,0,1,10,1,0,1,1,5489.0,111946.0
...,...,...,...,...,...,...,...,...,...
99352,0,13,3,13,1,1,4,12877.0,81410.0
99354,36,48,0,84,0,45,1,9175.0,73369.0
99361,0,1,0,1,0,0,1,750.0,69118.0
99362,1,27,0,28,1,3,12,15717.0,56665.0


In [7]:
# Split the data into training/testing sets
X_train, X_test, y_train, y_test = train_test_split(model_X, model_y, test_size=0.33, random_state=42)

In [8]:
input_scaler = StandardScaler()
output_scaler = StandardScaler()

X_train_scaled = input_scaler.fit_transform(X_train)
X_test_scaled = input_scaler.transform(X_test)

y_train_scaled = output_scaler.fit_transform(y_train.values.reshape(-1,1))
y_test_scaled = output_scaler.transform(y_test.values.reshape(-1,1))

In [9]:
# Train the model using the training sets
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

LinearRegression()

In [10]:
def predict(in_scaler, out_scaler, orig_df, ml_model, data_X):
  data_scaled = in_scaler.transform(data_X)

  pred_scaled = ml_model.predict(data_scaled)

  pred = out_scaler.inverse_transform(pred_scaled)

  pred_df = pd.DataFrame(pred, index=data_X.index, columns=['REG_PREDICT'])

  pred_out_df = orig_df.merge(pred_df, how='left', on='ZIP_CODE')

  return pred_out_df

In [11]:
model_predictions_df = predict(input_scaler, output_scaler, model_df, model, model_X)

# not working with current predict function
# pred_predictions_df = predict(input_scaler, output_scaler, pred_df, model, pred_X)


In [12]:
# export ev_ml_out_df for Tableau.
model_predictions_df.to_csv('pred_out_df.csv')

In [13]:
model_predictions_df

Unnamed: 0_level_0,EV_LEVEL_1,EV_LEVEL_2,EV_FAST,EV_TOTAL,NETWORK_TYPE_ChargePoint,NETWORK_TYPE_Non-Networked,NETWORK_TYPE_Other-Network,TOTAL_HOUSEHOLDS,MEDIAN_INCOME,REG_COUNTS,REG_PREDICT
ZIP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
03266,0,2,0,2,0,1,0,851.0,54904.0,0,-319.158586
07001,0,1,0,1,0,1,0,5022.0,69849.0,173,151.347939
07003,0,12,5,12,4,1,3,18577.0,78034.0,795,1671.743466
07004,0,6,3,6,0,1,2,2621.0,122063.0,307,1022.586281
07005,0,1,10,1,0,1,1,5489.0,111946.0,445,967.161030
...,...,...,...,...,...,...,...,...,...,...,...
99352,0,13,3,13,1,1,4,12877.0,81410.0,3413,1183.522265
99354,36,48,0,84,0,45,1,9175.0,73369.0,2082,1734.027563
99361,0,1,0,1,0,0,1,750.0,69118.0,100,60.995554
99362,1,27,0,28,1,3,12,15717.0,56665.0,2054,2121.591955


In [19]:


"""
Feature columns:
0. EV1
1. EV2
2. EV_fast
3. median household income
"""

def generate_predictions(X):
    X_scaled = input_scaler.transform(X)
    y_pred_scaled = model.predict(X_scaled)
    pred = output_scaler.inverse_transform(y_pred_scaled)
    pred_df = pd.DataFrame(pred, index=model_X.index, columns=['REG_PREDICT'])
    return pred_df

# create hypothetical dataset
#X_hypothetical = model_df
# add one EV_fast station
def add_two_fast(x):
    return x + 2
    
pred_2 = model_df[['EV_LEVEL_1', 'EV_LEVEL_2', 'EV_FAST', 'EV_TOTAL',
                    'NETWORK_TYPE_ChargePoint','NETWORK_TYPE_Non-Networked','NETWORK_TYPE_Other-Network',
                    'TOTAL_HOUSEHOLDS',	'MEDIAN_INCOME']]
pred_2['EV_FAST'] = pred_2['EV_FAST'].apply(add_two_fast)

# # add two EV_fast stations and one EV1
# pred_3 = X_hypothetical[:, 0] + 2
# pred_4 = X_hypothetical[:, 2] + 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
pred_2

Unnamed: 0_level_0,EV_LEVEL_1,EV_LEVEL_2,EV_FAST,EV_TOTAL,NETWORK_TYPE_ChargePoint,NETWORK_TYPE_Non-Networked,NETWORK_TYPE_Other-Network,TOTAL_HOUSEHOLDS,MEDIAN_INCOME
ZIP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
03266,0,2,2,2,0,1,0,851.0,54904.0
07001,0,1,2,1,0,1,0,5022.0,69849.0
07003,0,12,7,12,4,1,3,18577.0,78034.0
07004,0,6,5,6,0,1,2,2621.0,122063.0
07005,0,1,12,1,0,1,1,5489.0,111946.0
...,...,...,...,...,...,...,...,...,...
99352,0,13,5,13,1,1,4,12877.0,81410.0
99354,36,48,2,84,0,45,1,9175.0,73369.0
99361,0,1,2,1,0,0,1,750.0,69118.0
99362,1,27,2,28,1,3,12,15717.0,56665.0


In [21]:
pred_2_df = generate_predictions(pred_2)

In [22]:
pred_2_df

Unnamed: 0_level_0,REG_PREDICT
ZIP_CODE,Unnamed: 1_level_1
03266,-327.715179
07001,142.791346
07003,1663.186873
07004,1014.029688
07005,958.604437
...,...
99352,1174.965672
99354,1725.470970
99361,52.438961
99362,2113.035362


In [23]:
pred_2_df = pd.DataFrame(pred_2_df)

In [24]:
pred_2_df

Unnamed: 0_level_0,REG_PREDICT
ZIP_CODE,Unnamed: 1_level_1
03266,-327.715179
07001,142.791346
07003,1663.186873
07004,1014.029688
07005,958.604437
...,...
99352,1174.965672
99354,1725.470970
99361,52.438961
99362,2113.035362


In [25]:
def add_two_total(x):
    return x + 2
    
pred_3 = model_df[['EV_LEVEL_1', 'EV_LEVEL_2', 'EV_FAST', 'EV_TOTAL',
                    'NETWORK_TYPE_ChargePoint','NETWORK_TYPE_Non-Networked','NETWORK_TYPE_Other-Network',
                    'TOTAL_HOUSEHOLDS',	'MEDIAN_INCOME']]
pred_3['EV_TOTAL'] = pred_3['EV_TOTAL'].apply(add_two_total)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [32]:
pred_3

Unnamed: 0_level_0,EV_LEVEL_1,EV_LEVEL_2,EV_FAST,EV_TOTAL,NETWORK_TYPE_ChargePoint,NETWORK_TYPE_Non-Networked,NETWORK_TYPE_Other-Network,TOTAL_HOUSEHOLDS,MEDIAN_INCOME
ZIP_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
03266,0,2,0,4,0,1,0,851.0,54904.0
07001,0,1,0,3,0,1,0,5022.0,69849.0
07003,0,12,5,14,4,1,3,18577.0,78034.0
07004,0,6,3,8,0,1,2,2621.0,122063.0
07005,0,1,10,3,0,1,1,5489.0,111946.0
...,...,...,...,...,...,...,...,...,...
99352,0,13,3,15,1,1,4,12877.0,81410.0
99354,36,48,0,86,0,45,1,9175.0,73369.0
99361,0,1,0,3,0,0,1,750.0,69118.0
99362,1,27,0,30,1,3,12,15717.0,56665.0


In [27]:
pred_3_df = generate_predictions(pred_3)

In [30]:
pred_3_df = pd.DataFrame(pred_3_df)

In [31]:
pred_3_df

Unnamed: 0_level_0,REG_PREDICT
ZIP_CODE,Unnamed: 1_level_1
03266,-353.889602
07001,116.616923
07003,1637.012450
07004,987.855264
07005,932.430014
...,...
99352,1148.791248
99354,1699.296547
99361,26.264538
99362,2086.860939


In [46]:
def add_four_EV2(x):
    return x + 20000
    
pred_4 = model_df[['EV_LEVEL_1', 'EV_LEVEL_2', 'EV_FAST', 'EV_TOTAL',
                    'NETWORK_TYPE_ChargePoint','NETWORK_TYPE_Non-Networked','NETWORK_TYPE_Other-Network',
                    'TOTAL_HOUSEHOLDS','MEDIAN_INCOME']]
pred_4['MEDIAN_INCOME'] = pred_4['MEDIAN_INCOME'].apply(add_four_EV2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [47]:
pred_4_df = generate_predictions(pred_4)

In [48]:
pred_4_df = pd.DataFrame(pred_4_df)

In [49]:
pred_4_df

Unnamed: 0_level_0,REG_PREDICT
ZIP_CODE,Unnamed: 1_level_1
03266,-13.710736
07001,456.795789
07003,1977.191316
07004,1328.034131
07005,1272.608880
...,...
99352,1488.970115
99354,2039.475413
99361,366.443404
99362,2427.039805
