### Import things

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV, LassoCV
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
df = pd.read_csv('./CSVs/all_clean.csv')

In [3]:
#drop passenger to frankfurt
df = df[df['book_title'].str.lower() != 'passenger to frankfurt'].reset_index(drop=True)

### Drop the features that aren't normalized

In [4]:
df.columns

Index(['book_title', 'book_text', 'year_written', 'sentence_count',
       'word_count', 'syllable_count', 'flesch_reading_ease',
       'avg_words_per_sentence', 'avg_syllables_per_word', 'stemmed_text',
       'unique_word_count', 'unique_words_%', 'total_adj_count',
       'unique_adj_count', 'total_noun_count', 'unique_noun_count', 'thing',
       '%_thing', 'something', '%_something', 'anything', '%_anything',
       'stuff', '%_stuff', 'lot', '%_lot', 'very', '%_very',
       'repeated_trigrams', 'total_pos_sentences', 'total_neu_sentences',
       'total_neg_sentences', '%_pos_sentences', '%_neu_sentences',
       '%_neg_sentences', 'stemmed_removed_stopwords', 'only_stopwords',
       'stopwords_count', '%_stop_words', '%_repeated_trigrams',
       '%_unique_adj', '%_unique_noun', '%_adj_in_text', '%_noun_in_text'],
      dtype='object')

In [5]:
#drop the columns that are just counts (not normalized):
to_drop = ['book_title', 'book_text', 'sentence_count', 
              'word_count', 'syllable_count','stemmed_text',
              'unique_word_count', 'unique_words_%', 'total_adj_count',
              'unique_adj_count', 'total_noun_count', 'unique_noun_count', 
              'thing','anything', 'stuff', 'something', 'lot','very',
              'repeated_trigrams', 'total_pos_sentences',
              'total_neu_sentences', 'total_neg_sentences', 'stemmed_removed_stopwords',
              'only_stopwords', 'stopwords_count']

train = df.drop(to_drop, axis=1)
train['decade_written'] = train['year_written'].map(lambda x: int(str(x)[:3] + '0'))

short_stories = short_stories.drop(to_drop, axis=1)
short_stories['decade_written'] = short_stories['year_written'].map(lambda x: int(str(x)[:3] + '0'))

In [6]:
train.to_csv('./CSVs/train.csv')

In [7]:
train.columns

Index(['year_written', 'flesch_reading_ease', 'avg_words_per_sentence',
       'avg_syllables_per_word', '%_thing', '%_something', '%_anything',
       '%_stuff', '%_lot', '%_very', '%_pos_sentences', '%_neu_sentences',
       '%_neg_sentences', '%_stop_words', '%_repeated_trigrams',
       '%_unique_adj', '%_unique_noun', '%_adj_in_text', '%_noun_in_text',
       'decade_written'],
      dtype='object')

### Make X and y and scale the data

In [8]:
X = train.drop(['year_written', 'decade_written'], axis=1)
y = train['decade_written']

In [9]:
#need to stratify because classes are uneven
X_train, X_test,y_train, y_test = train_test_split(X, y, stratify=y, random_state = 60)

In [10]:
sc = StandardScaler()

In [11]:
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

### Baseline
The dummy regressor always predicts the mean, which is 1941

In [12]:
y_train.astype(int).mean()

1941.063829787234

In [13]:
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train_scaled, y_train)

DummyRegressor()

In [14]:
print('Baseline:''\nTrain Score:', dummy.score(X_train_scaled, y_train),'\nTest Score:', dummy.score(X_test_scaled, y_test))

Baseline:
Train Score: 0.0 
Test Score: -0.00017466128187137997


### Trying different models

In [15]:
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
print('Model: Logistic Regression''\nTrain Score:', logreg.score(X_train_scaled, y_train),'\nTest Score:', logreg.score(X_test_scaled, y_test))

Model: Logistic Regression
Train Score: 0.9361702127659575 
Test Score: 0.25


In [16]:
linear = LinearRegression()
linear.fit(X_train_scaled, y_train)
print('Model: Linear Regression''\nTrain Score:', linear.score(X_train_scaled, y_train),'\nTest Score:', linear.score(X_test_scaled, y_test))

Model: Linear Regression
Train Score: 0.8771176939829655 
Test Score: 0.6966793509800457


In [17]:
lasso = LassoCV(cv=5)
lasso.fit(X_train_scaled, y_train)
print('Model: Lasso Regression', '\nTrain Score:', lasso.score(X_train_scaled, y_train),'\nTest Score:', lasso.score(X_test_scaled, y_test))

Model: Lasso Regression 
Train Score: 0.8467731296586589 
Test Score: 0.7270014933020995


In [18]:
ridge = RidgeCV(cv=5)
ridge.fit(X_train_scaled, y_train)
print('Model: Ridge Regression''\nTrain Score:', ridge.score(X_train_scaled, y_train),'\nTest Score:', ridge.score(X_test_scaled, y_test))

Model: Ridge Regression
Train Score: 0.834568688969893 
Test Score: 0.7114815582090936


In [19]:
forest = RandomForestRegressor()
forest.fit(X_train_scaled, y_train)
print('Model: Random Forest Regression''\nTrain Score:', forest.score(X_train_scaled, y_train),'\nTest Score:', forest.score(X_test_scaled, y_test))

Model: Random Forest Regression
Train Score: 0.944582666980245 
Test Score: 0.45108346456692827


In [20]:
boost = GradientBoostingRegressor(min_samples_split = 5)
boost.fit(X_train_scaled, y_train)
print('Model: Gradient Boost''\nTrain Score:', boost.score(X_train_scaled, y_train),'\nTest Score:', boost.score(X_test_scaled, y_test))

Model: Gradient Boost
Train Score: 0.9999568791437801 
Test Score: 0.3559345265410766


In [21]:
real_years = df.loc[y_test.index]['year_written'].reset_index(drop=True)

In [22]:
preds = pd.Series(lasso.predict(X_test_scaled)).astype(int)

In [23]:
preds_and_real = pd.concat([preds, real_years], axis=1)
preds_and_real.columns = ['predictions', 'actual']

In [24]:
preds_and_real['difference'] = abs(preds_and_real['actual'] - preds_and_real['predictions'])

In [25]:
preds_and_real

Unnamed: 0,predictions,actual,difference
0,1976,1973,3
1,1936,1936,0
2,1943,1956,13
3,1945,1949,4
4,1929,1935,6
5,1936,1931,5
6,1930,1927,3
7,1925,1922,3
8,1932,1932,0
9,1934,1941,7
