In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
pd.set_option('display.max_columns', 25)

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn import metrics

from tqdm import tqdm_notebook

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import statsmodels.api as sm
from statsmodels.api import OLS

from pandas.plotting import scatter_matrix
from collections import Counter

import seaborn as sns
sns.set(style='whitegrid')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

In [2]:
# Read the full dataset from a CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/full_eda_data.csv')

In [3]:
# Columns used as model inputs (the feature matrix is restricted to exactly
# these, in this order).
predictors = [
    'track_number', 'track_count', 'duration', 'explicit',
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
]

# Column that is split off as the regression target.
response = 'comparative_pop'

In [4]:
# Hold out 20% of the rows as a test set.
# 109 seeds NumPy's global RNG and is also passed to the splitter, so the
# partition is reproducible on a fresh kernel.
np.random.seed(109)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=response),  # every column except the target
    data[response],
    test_size=0.2,
    random_state=109,
)

In [5]:
# Restrict both splits to the chosen predictor columns (as independent copies).
X_train = X_train[predictors].copy()
X_test = X_test[predictors].copy()

# statsmodels does not add an intercept on its own, so prepend a constant
# column before fitting; then print the full coefficient table.
X_train_const = sm.add_constant(X_train)
ols_spec = sm.OLS(endog=y_train, exog=X_train_const)
lin_model = ols_spec.fit()
print(lin_model.summary())

                            OLS Regression Results                            
Dep. Variable:        comparative_pop   R-squared:                       0.084
Model:                            OLS   Adj. R-squared:                  0.084
Method:                 Least Squares   F-statistic:                     212.3
Date:                Wed, 20 Nov 2019   Prob (F-statistic):               0.00
Time:                        20:16:14   Log-Likelihood:            -1.1189e+05
No. Observations:               30131   AIC:                         2.238e+05
Df Residuals:                   30117   BIC:                         2.239e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -8.7518      0.635  

  return ptp(axis=axis, out=out, **kwargs)


In [6]:
# Baseline: scikit-learn linear regression on the raw predictors.
# NOTE: this rebinds `lin_model`, which previously held the statsmodels fit.
lin_model = LinearRegression().fit(X_train, y_train)

# .score() is evaluated on each split separately.
train_score = lin_model.score(X_train, y_train)
test_score = lin_model.score(X_test, y_test)

# One-row summary table, shown as the cell's output.
pd.DataFrame({
    'Model': ["Linear Model"],
    'Training Score': [train_score],
    'Test Score': [test_score],
})

Unnamed: 0,Model,Training Score,Test Score
0,Linear Model,0.083961,0.079604


In [7]:
# Sweep the neighbourhood size k (50, 100, ..., 950) for a kNN regressor on
# the raw predictors and show the five settings with the best test score.
# NOTE(review): the features are passed to kNN unscaled; MinMaxScaler is
# imported but not applied in the cells visible here -- confirm this is intended.
# NOTE(review): k is ranked on the test split, so the winning test score is an
# optimistic estimate -- consider cross-validation on the training split.
score_set = []

for k in range(50, 1000, 50):
    # Fit on the training split only, then score both splits.
    model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    score_set.append([k, train_score, test_score])

score_frame = pd.DataFrame(score_set, columns=['k', 'Training Score', 'Test Score'])
score_frame.sort_values(by='Test Score', ascending=False).head()

Unnamed: 0,k,Training Score,Test Score
3,200,0.024366,0.018729
5,300,0.021272,0.018253
4,250,0.022345,0.017645
6,350,0.020553,0.017288
7,400,0.019862,0.017044


In [8]:
# Add pairwise interaction terms to the dataset.
#
# Each *unordered* pair of predictors gets exactly one product column named
# "<first>*<second>", where <first> is the column that appears earlier in
# X_train.columns. Looping over ordered pairs would create both "a*b" and
# "b*a", which are identical: that doubles the feature count, makes the design
# matrix rank-deficient, and double-weights every interaction in distance-based
# models such as kNN.
def _with_interactions(frame, columns):
    """Return a new frame: `frame` plus one product column per unordered pair.

    Parameters
    ----------
    frame : pd.DataFrame
        Source data; it is not modified.
    columns : list of str
        Column names to combine pairwise. The same list must be used for the
        train and test frames so both end up with identical columns.

    Returns
    -------
    pd.DataFrame
        The original columns followed by the interaction columns.
    """
    products = {
        first_col + "*" + second_col: frame[first_col] * frame[second_col]
        for position, first_col in enumerate(columns)
        for second_col in columns[position + 1:]
    }
    # Attach all new columns in one concat rather than inserting them one at a
    # time, which fragments the frame's internal blocks.
    return pd.concat([frame, pd.DataFrame(products, index=frame.index)], axis=1)


# Derive the column list once from the training frame and reuse it for both.
interaction_base_cols = list(X_train.columns)
X_train_int = _with_interactions(X_train, interaction_base_cols)
X_test_int = _with_interactions(X_test, interaction_base_cols)

In [9]:
# Refit the linear regression, this time on the predictors plus their
# interaction terms. Rebinds `lin_model`, `train_score` and `test_score`.
lin_model = LinearRegression().fit(X_train_int, y_train)

train_score = lin_model.score(X_train_int, y_train)
test_score = lin_model.score(X_test_int, y_test)

# One-row summary table, shown as the cell's output.
pd.DataFrame({
    'Model': ["Linear Model"],
    'Training Score': [train_score],
    'Test Score': [test_score],
})

Unnamed: 0,Model,Training Score,Test Score
0,Linear Model,0.111528,0.093626


In [10]:
# kNN regression on the interaction-augmented features.
# Sweep the neighbourhood size k (50, 100, ..., 950) and show the five settings
# with the best test score.
# NOTE(review): the features are passed to kNN unscaled; MinMaxScaler is
# imported but not applied in the cells visible here -- confirm this is intended.
# NOTE(review): k is ranked on the test split, so the winning test score is an
# optimistic estimate -- consider cross-validation on the training split.
score_set = []

for k in range(50, 1000, 50):
    # Fit on the training split only, then score both splits.
    model = KNeighborsRegressor(n_neighbors=k).fit(X_train_int, y_train)
    train_score = model.score(X_train_int, y_train)
    test_score = model.score(X_test_int, y_test)
    score_set.append([k, train_score, test_score])

score_frame = pd.DataFrame(score_set, columns=['k', 'Training Score', 'Test Score'])
score_frame.sort_values(by='Test Score', ascending=False).head()

Unnamed: 0,k,Training Score,Test Score
1,100,0.108242,0.087705
2,150,0.102083,0.084606
3,200,0.098556,0.083986
0,50,0.119722,0.081886
4,250,0.095503,0.081864
