Gebil Jibul

Description: This program demonstrates the application of machine learning algorithms on sample data.

I will use several regression algorithms (linear regression, Lasso, and k-nearest neighbors) to predict y (i.e., the column in the data labeled 'y'). Note that the target is a continuous numerical variable, so this is a regression problem. The models are compared using the mean squared error on the train and test splits; I can optionally print out the R² score as well (a statistical measure that represents the proportion of the variance of the dependent variable that is explained by the independent variable or variables in a regression model).

##### Data & Cleaning

In [170]:
import pandas as pd

# Loads the raw sample data; the path is relative to the notebook's working directory
eda_data = pd.read_csv(filepath_or_buffer='data/eda_data.csv')

In [171]:
import re

# Function performs all cleaning operations
def str_to_int(series):
    """Convert a Series of formatted numeric strings into floats.

    Despite the name (kept so existing callers keep working), the result
    holds floats, not ints.

    Conversion rules, applied per element:
      * missing values (NaN/None) become 0.0
      * '$', ',' and literal '|' characters are removed
      * a value containing '(' is treated as negative: '(200)' -> -200.0
      * a value containing '%' is divided by 100: '5%' -> 0.05
      * both may be combined: '(5%)' -> -0.05
      * non-string elements are converted with str() before parsing

    Parameters
    ----------
    series : pandas.Series
        Column of raw values to clean.

    Returns
    -------
    pandas.Series
        Series with the same index holding the parsed float values.

    Raises
    ------
    ValueError
        If an element still cannot be parsed as a number after cleaning.
    """
    # Replaces NaN with string '0'. Instances converted to float later to prevent error
    series = series.fillna('0')

    # Removes characters preventing conversion to float.
    # '|' inside a character class is a literal pipe, so pipes are stripped as
    # well; kept as-is for backward compatibility.
    series = series.replace('[$|,]', '', regex=True)

    def conversion(value):
        # str() lets already-numeric cells pass through instead of raising
        # TypeError on the substring checks below
        text = str(value)
        is_negative = '(' in text
        is_percent = '%' in text

        if is_negative:
            # Accounting notation: parentheses mark a negative amount
            text = re.sub('[()]', '', text)
        if is_percent:
            # Strip surrounding whitespace first so a trailing '% ' still parses
            text = text.strip().strip('%')

        number = float(text)
        if is_percent:
            number = number / 100
        if is_negative:
            number = number * -1
        return number

    return series.apply(conversion)

In [172]:
# Columns stored as formatted text that must be parsed into numbers
dirty_cols = ['x6', 'x10']

# Replaces each dirty column with its cleaned version, then discards
# every row that still contains a missing value
eda_data = (
    eda_data
    .assign(**{name: str_to_int(eda_data[name]) for name in dirty_cols})
    .dropna()
)

##### Split & Normalization

In [173]:
from sklearn.model_selection import train_test_split

# Separates the feature columns (X) from the target column 'y'
X = eda_data.drop(columns='y')
y = eda_data['y'].copy()

# Holds out 10% of the rows for testing (90/10 train/test split);
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [174]:
from sklearn.preprocessing import StandardScaler

# Standardizes the features (all quantitative). The scaler is fitted on the
# training split only, so no information from the test split is used.
scaler = StandardScaler().fit(X_train)

# Overwrites X_train with its standardized version.
# NOTE: re-running this cell would refit the scaler on already-scaled data;
# restart and run all cells from the top instead.
X_train = scaler.transform(X_train)

In [175]:
# Sanity check on the scaling: every column should show mean ~0 and std ~1
pd.DataFrame(data=X_train).describe().round(decimals=4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0,8978.0
mean,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0
std,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
min,-3.902,-3.5537,-3.743,-3.6653,-4.0604,-3.5308,-4.4897,-4.9885,-3.7593,-4.4113,-3.8518,-4.1506,-3.879,-4.288
25%,-0.6767,-0.6742,-0.6698,-0.6678,-0.6658,-0.6979,-0.6646,-0.6621,-0.6676,-0.6189,-0.9613,-0.7006,-0.687,-0.6727
50%,0.0056,-0.009,0.0021,0.0014,-0.0149,-0.0231,0.0103,-0.0024,0.0035,0.022,0.0023,0.0256,-0.0278,0.0041
75%,0.6766,0.6681,0.6743,0.6602,0.6742,0.6595,0.6844,0.6653,0.6754,0.6633,0.9658,0.6791,0.6645,0.6632
max,3.7215,4.1306,3.5006,4.2405,3.893,3.904,3.7019,3.6612,4.7774,3.7243,3.8563,3.5599,3.9539,3.3327


##### Model Building

In [176]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor


# Multiple linear regression: fit on the training split, then predict in-sample
ml_reg = LinearRegression().fit(X_train, y_train)
y_pred_ml_reg = ml_reg.predict(X_train)

# Lasso (L1-penalized) regression
# NOTE(review): alpha=10 is a strong penalty for standardized features, and the
# recorded sample predictions for Lasso are the same value on every row --
# confirm this alpha is intended.
lasso = Lasso(alpha=10).fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_train)

# k-nearest-neighbors regression using the 12 closest training points
knn = KNeighborsRegressor(n_neighbors=12).fit(X_train, y_train)
y_pred_knn = knn.predict(X_train)

##### Evaluation

In [177]:
# Applies the scaler fitted on the training split to the test split.
# NOTE: X_test is overwritten, so run this cell only once per kernel session.
X_test = scaler.transform(X=X_test)

In [178]:
from sklearn.metrics import mean_squared_error

# Builds MSE dataframe, compares train vs test for all models
model_dict = {'MLR': ml_reg, 'Lasso': lasso, 'KNN': knn}

# Calculates the MSE of every model on both splits and builds the table in a
# single constructor call, so the columns are float64 rather than the object
# dtype produced by filling an empty frame cell by cell.
# Rows follow the insertion order of model_dict.
mse = pd.DataFrame(
    {
        'train': [
            mean_squared_error(y_true=y_train, y_pred=fitted.predict(X_train))
            for fitted in model_dict.values()
        ],
        'test': [
            mean_squared_error(y_true=y_test, y_pred=fitted.predict(X_test))
            for fitted in model_dict.values()
        ],
    },
    index=list(model_dict),
    columns=['train', 'test'],
)

mse

Unnamed: 0,train,test
MLR,0.083183,0.083092
Lasso,0.083264,0.083092
KNN,0.075993,0.090238


In [180]:
# Takes the first 10 test rows for a quick side-by-side look at the predictions
demo_pred = pd.DataFrame(X_test).head(10).copy()

# Starts with the true targets, then adds one 'pred_<model>' entry per fitted model
pred_dict = {'y_true': y_test.iloc[:10]}
pred_dict.update({
    'pred_' + label: estimator.predict(demo_pred).round(10)
    for label, estimator in model_dict.items()
})

pd.DataFrame(pred_dict)

Unnamed: 0,y_true,pred_MLR,pred_Lasso,pred_KNN
5502,0.626702,0.487345,0.496439,0.346884
8937,0.228753,0.500683,0.496439,0.499971
8966,0.773118,0.501995,0.496439,0.559976
1105,0.02629,0.498423,0.496439,0.490555
4971,0.780776,0.478787,0.496439,0.49651
2384,0.883035,0.496509,0.496439,0.395432
2159,0.974479,0.497267,0.496439,0.526023
273,0.521794,0.491538,0.496439,0.578863
6857,0.324013,0.498065,0.496439,0.393862
5377,0.395995,0.501725,0.496439,0.699732
