### Import things

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV, LassoCV
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
# Load the cleaned dataset (appears to hold one row per book, with text-derived features).
df = pd.read_csv(filepath_or_buffer='./CSVs/all_clean.csv')

In [3]:
# Hold 'Passenger to Frankfurt' out in its own frame and remove it from the modelling data.
# The title comparison is case-insensitive; the mask is computed once and reused.
is_frankfurt = df['book_title'].str.lower() == 'passenger to frankfurt'
frankfurt = df[is_frankfurt].reset_index(drop=True)
df = df[~is_frankfurt].reset_index(drop=True)

### Drop the features that aren't normalized

In [4]:
# Bucket the held-out book's year into its decade (e.g. 1968 -> 1960).
# Stored as an int so the column has the same dtype as train['decade_written'];
# the previous string value ('1960') could not be compared with the numeric
# decades the models are trained on.
frankfurt['decade_written'] = frankfurt['year_written'].map(lambda x: int(str(x)[:3] + '0'))

In [5]:
# List every available column before choosing which ones to drop.
df.columns

Index(['book_title', 'book_text', 'copyright', 'year_written',
       'sentence_count', 'word_count', 'syllable_count', 'flesch_reading_ease',
       'avg_words_per_sentence', 'avg_syllables_per_word', 'stemmed_text',
       'unique_word_count', 'unique_words_%', 'total_adj_count',
       'total_noun_count', 'adj_word_list', 'noun_word_list',
       'unique_adj_count', 'unique_noun_count', 'something', 'thing',
       'anything', 'repeated_trigrams', 'total_pos_sentences',
       'total_neu_sentences', 'total_neg_sentences', '%_pos_sentences',
       '%_neu_sentences', '%_neg_sentences', '%_repeated_trigrams',
       '%_unique_adj', '%_unique_noun', '%_adj_in_text', '%_noun_in_text',
       '%_something_in_text', '%_thing_in_text', '%_anything_in_text',
       'stemmed_removed_stopwords', 'only_stopwords', 'stopwords_count',
       '%_stop_words', 'stuff_count', '%_marchbolt', 'lot_count', 'very_count',
       '%_stuff', '%_lot', '%_very'],
      dtype='object')

In [6]:
# Columns excluded from modelling: title/raw-text fields, raw counts that are not
# normalized by book length, and the '%_marchbolt' column.
to_drop = [
    'book_title', 'book_text', 'copyright',
    'sentence_count', 'word_count', 'syllable_count',
    'stemmed_text', 'unique_word_count',
    'total_adj_count', 'total_noun_count',
    'adj_word_list', 'noun_word_list',
    'unique_adj_count', 'unique_noun_count',
    'something', 'thing', 'anything',
    'repeated_trigrams',
    'total_pos_sentences', 'total_neu_sentences', 'total_neg_sentences',
    'stemmed_removed_stopwords', 'only_stopwords', 'stopwords_count',
    'stuff_count', '%_marchbolt', 'lot_count', 'very_count',
]

# Training frame: the remaining columns plus the decade (as an int) derived from
# the first three digits of the year, e.g. 1968 -> 1960.
train = df.drop(columns=to_drop).assign(
    decade_written=lambda frame: frame['year_written'].map(lambda yr: int(str(yr)[:3] + '0'))
)

# Apply the same column selection to the held-out book.
frankfurt = frankfurt.drop(columns=to_drop)

In [7]:
# Persist the modelling frame. The row index is written too (pandas default), so the
# CSV gains a leading unnamed column that readers must account for when loading it.
train.to_csv('./CSVs/train.csv', index=True)

In [8]:
# Check which columns remain after the drop (features plus the year/decade columns).
train.columns

Index(['year_written', 'flesch_reading_ease', 'avg_words_per_sentence',
       'avg_syllables_per_word', 'unique_words_%', '%_pos_sentences',
       '%_neu_sentences', '%_neg_sentences', '%_repeated_trigrams',
       '%_unique_adj', '%_unique_noun', '%_adj_in_text', '%_noun_in_text',
       '%_something_in_text', '%_thing_in_text', '%_anything_in_text',
       '%_stop_words', '%_stuff', '%_lot', '%_very', 'decade_written'],
      dtype='object')

### Make X and y and scale the data

In [9]:
# Target: the decade each book was written in.
y = train['decade_written']
# Features: everything except the target and the exact year the target is derived from.
X = train.drop(columns=['year_written', 'decade_written'])

In [10]:
# Hold out a test split (default test size). Stratifying on the decade keeps the
# unevenly represented decades in similar proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=60,
)

In [11]:
# Standardizer for the feature matrix; it is fitted on the training split in the next cell.
sc = StandardScaler()

In [12]:
# Learn the scaling statistics from the training split only, then apply that same
# transform to both splits so no information from the test split leaks into the fit.
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

### Baseline
The dummy regressor always predicts the mean of the training target (the decade written), which is about 1941.

In [13]:
# Mean of the training target: the constant value a mean-strategy dummy baseline predicts.
y_train.astype(int).mean()

1941.063829787234

In [14]:
# Baseline: ignores the features and always predicts the mean of y_train.
dummy = DummyRegressor(strategy='mean').fit(X_train_scaled, y_train)
# Echo the fitted estimator as the cell output.
dummy

DummyRegressor()

In [15]:
# Score the baseline on both splits; every model below is compared against these numbers.
baseline_train_score = dummy.score(X_train_scaled, y_train)
baseline_test_score = dummy.score(X_test_scaled, y_test)
print('Baseline:\nTrain Score:', baseline_train_score, '\nTest Score:', baseline_test_score)

Baseline:
Train Score: 0.0 
Test Score: -0.00017466128187137997


### Trying different models

In [16]:
# Logistic regression treats each decade as a separate class.
# NOTE: for a classifier, .score() is mean accuracy, whereas the regressors in this
# notebook report R^2, so these numbers are not directly comparable with the others.
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
logreg_train_score = logreg.score(X_train_scaled, y_train)
logreg_test_score = logreg.score(X_test_scaled, y_test)
print('Model: Logistic Regression\nTrain Score:', logreg_train_score, '\nTest Score:', logreg_test_score)

Model: Logistic Regression
Train Score: 1.0 
Test Score: 0.1875


In [17]:
# Ordinary least squares on the standardized features.
linear = LinearRegression()
linear.fit(X_train_scaled, y_train)
linear_train_score = linear.score(X_train_scaled, y_train)
linear_test_score = linear.score(X_test_scaled, y_test)
print('Model: Linear Regression\nTrain Score:', linear_train_score, '\nTest Score:', linear_test_score)

Model: Linear Regression
Train Score: 0.8849913309649478 
Test Score: 0.6766163331829248


In [18]:
# L1-regularized regression; LassoCV picks the regularization strength by cross-validation.
lasso = LassoCV()
lasso.fit(X_train_scaled, y_train)
lasso_train_score = lasso.score(X_train_scaled, y_train)
lasso_test_score = lasso.score(X_test_scaled, y_test)
print('Model: Lasso Regression', '\nTrain Score:', lasso_train_score, '\nTest Score:', lasso_test_score)

Model: Lasso Regression 
Train Score: 0.8430301033955295 
Test Score: 0.7025826641956279


In [19]:
# L2-regularized regression; RidgeCV picks the regularization strength by cross-validation.
ridge = RidgeCV()
ridge.fit(X_train_scaled, y_train)
ridge_train_score = ridge.score(X_train_scaled, y_train)
ridge_test_score = ridge.score(X_test_scaled, y_test)
print('Model: Ridge Regression\nTrain Score:', ridge_train_score, '\nTest Score:', ridge_test_score)

Model: Ridge Regression
Train Score: 0.8741075597430329 
Test Score: 0.7075773773621976


In [20]:
# Random forest on the same scaled features.
# The forest is stochastic, so without a seed the reported scores change on every
# run. Pinning random_state makes them reproducible on Restart & Run All; 60 matches
# the seed already used for the train/test split.
forest = RandomForestRegressor(random_state=60)
forest.fit(X_train_scaled, y_train)
print('Model: Random Forest Regression''\nTrain Score:', forest.score(X_train_scaled, y_train),'\nTest Score:', forest.score(X_test_scaled, y_test))

Model: Random Forest Regression
Train Score: 0.9368749764816553 
Test Score: 0.4774330708661454
