In [5]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import matplotlib

In [6]:
matplotlib.rcParams.update({'font.size': 12})
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [7]:
def save_prediction_to_csv(y_pred):
    """
    Use this function to save your prediction result to a csv file.
    The resulting csv file is named as [team_name].csv

    :param y_pred: an array or a pandas series that follows the SAME index order as in the testing data
    """
    pd.DataFrame(dict(
        target=y_pred
    )).to_csv('predictions.csv', index=False, header=False)

In [8]:
def prepare_data(thresh = 0.85):

    #### Read Train and Test Data
    training_data = pd.read_csv('/home/rajneesh/Desktop/Queen_City_Hackathon/training.csv', index_col=0)
    testing_data = pd.read_csv('/home/rajneesh/Desktop/Queen_City_Hackathon/testing.csv', index_col=0)

    ### Check for the co relared columns 

    # Create correlation matrix
    corr_matrix = training_data.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.95
    columns_to_drop = [column for column in upper.columns if any(upper[column] > thresh)]
    if 'target' in columns_to_drop:
        columns_to_drop.remove('target')

    ### Drop all the co-related columns
    data_nodups_df = training_data.drop(columns=columns_to_drop)
    
    #### Data Preparation
    train_df = data_nodups_df.fillna(data_nodups_df.mean())

    #y = training_data['target']
    #X = data_nonull_df.drop(columns=['target'])

    test_nodups_df = testing_data.drop(columns=columns_to_drop)
    test_df = test_nodups_df.fillna(data_nodups_df.mean())
    
    return (train_df, test_df)

In [9]:
(train_df, test_df) = prepare_data(thresh=0.85)
y = train_df['target']
X = train_df.drop(columns = ['target'])


st_scale = StandardScaler()
X = st_scale.fit_transform(X)
test_df = st_scale.fit_transform(test_df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=101)

**********
### Modeling

In [100]:
rf_pipe = Pipeline([('rforest', RandomForestRegressor())])

rf_pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(rf_pipe.score(X_test, y_test )))



Test score: 0.08


In [9]:
gbr_pipe = Pipeline([('rforest', GradientBoostingRegressor())])

gbr_pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(gbr_pipe.score(X_test, y_test )))

Test score: 0.10


In [11]:
abr_pipe = Pipeline([('rforest', AdaBoostRegressor())])

abr_pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(abr_pipe.score(X_test, y_test )))

Test score: 0.09


In [12]:
bg_pipe = Pipeline([('rforest', BaggingRegressor())])

bg_pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(bg_pipe.score(X_test, y_test )))

Test score: 0.10


In [14]:
print(mean_squared_error(y_test, rf_pipe.predict(X_test)))
print(mean_squared_error(y_test, gbr_pipe.predict(X_test)))
print(mean_squared_error(y_test, abr_pipe.predict(X_test)))
print(mean_squared_error(y_test, bg_pipe.predict(X_test)))

1.4650672391751234
1.4034649813744355
1.3942168818264178
1.381505006356169


In [118]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10}
voting_clf = VotingRegressor([('knn', KNeighborsRegressor(n_neighbors=3)), ('ridge', Ridge(alpha=10)), ('lasso', Lasso(alpha=10)), ('xgb', xgb.XGBRegressor())])

voting_clf.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




VotingRegressor(estimators=[('knn',
                             KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=3,
                                                 p=2, weights='uniform')),
                            ('ridge',
                             Ridge(alpha=10, copy_X=True, fit_intercept=True,
                                   max_iter=None, normalize=False,
                                   random_state=None, solver='auto',
                                   tol=0.001)),
                            ('lasso',
                             Lasso(alpha=10, copy_X=True, fit_intercep...
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytr

In [119]:
print("Test score: {:.2f}".format(voting_clf.score(X_test, y_test )))
print(mean_squared_error(y_test, voting_clf.predict(X_test)))

Test score: 0.13
1.3333612642158403


In [120]:
voting_clf.fit(X, y)
y_pred = voting_clf.predict(test_df)
save_prediction_to_csv(y_pred)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [12]:
from mlxtend.regressor import StackingRegressor
lr = LinearRegression()
svr_lin = SVR(kernel='linear')
ridge = Ridge(random_state=1)
svr_rbf = SVR(kernel='rbf')

stregr = StackingRegressor(regressors=[svr_lin, lr, ridge], meta_regressor=svr_rbf)

# Training the stacking classifier

stregr.fit(X_train, y_train)
stregr.predict(X_test)

# Evaluate and visualize the fit

print("Mean Squared Error: %.4f" % np.mean((stregr.predict(X_train) - y_train) ** 2))
print('Variance Score: %.4f' % stregr.score(X_train, y_train))

print("Test score: {:.2f}".format(stregr.score(X_train, y_train)))
print(mean_squared_error(y_test, stregr.predict(X_test)))

Mean Squared Error: 0.0164
Variance Score: 0.7910
Test score: 0.79
1.3978444353592037


