**Imports & reading processed csv** 

In [1]:
# Imports and It reads the CSV file and drops the 'Unnamed: 0' column. 

# Data manipulation(cleaning), Plot 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-processing
import nltk
from nltk.corpus import stopwords; # nltk.download('stopwords');
from nltk import tokenize
from nltk import pos_tag
from nltk.tokenize import word_tokenize; # nltk.download('punkt'); nltk.download('averaged_perceptron_tagger');
import re
from spellchecker import SpellChecker
from collections import Counter

# Prep ML - Split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# ML models
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

myDataset = pd.read_csv("myFeatures.csv")
myDataset.drop('Unnamed: 0',inplace=True,axis=1)
myDataset.head(2)

Unnamed: 0,normalized_score,clean_essay,Sat500,char_count,word_count,sentences_count,spelling_mistake_count,avg_word_len,count_nouns,count_adjectives,count_adverts,count_verbs
0,6,dear local newspaper think effects computers p...,0,904,157,1,12,5.757962,68,29,12,36
1,7,dear believe using computers benefit us many w...,6,1220,217,1,11,5.62212,95,24,10,61


**Count Vectorizer** builds a numerical representation of the text provided in the "clean_essay" column of the myDataset dataframe. It extracts the most important words and phrases from the text and then creates feature vectors that describe each example in the dataset.

In [21]:
# stopwords = a an I (small words) ngram_range = range of wards in the vector like two words together | max_features to limit train set

vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,5),max_features=20000)
cv = vectorizer.fit_transform(myDataset['clean_essay'])
X = cv.toarray()
cv = 0; # Too free up ram
vocab = vectorizer.vocabulary_
vocab

{'dear': 4223,
 'local': 10057,
 'newspaper': 11761,
 'think': 17775,
 'effects': 5163,
 'computers': 3244,
 'people': 12998,
 'great': 7264,
 'learning': 9360,
 'time': 18052,
 'chat': 2607,
 'helps': 7764,
 'learn': 9282,
 'globe': 6902,
 'keeps': 8736,
 'thing': 17595,
 'dont': 4906,
 'feel': 6085,
 'teenager': 17390,
 'phone': 13529,
 'friends': 6558,
 'buisness': 2151,
 'partner': 12830,
 'things': 17648,
 'new': 11695,
 'way': 19301,
 'computer': 3072,
 'plenty': 13703,
 'sites': 16106,
 'internet': 8581,
 'facebook': 5738,
 'myspace': 11430,
 'ect': 5121,
 'setting': 15698,
 'meeting': 10813,
 'boss': 1774,
 'fun': 6672,
 'rushing': 15079,
 'cause': 2413,
 'want': 19056,
 'use': 18736,
 'outside': 12509,
 'going': 6914,
 'child': 2657,
 'spends': 16428,
 'lot': 10192,
 'ask': 766,
 'question': 14130,
 'economy': 5119,
 'sea': 15537,
 'floor': 6343,
 'spreading': 16462,
 'surprise': 17108,
 'heshe': 7782,
 'knows': 8982,
 'believe': 1245,
 'interesting': 8572,
 'class': 2842,
 'd

In [22]:
pickle.dump(vectorizer,open("Models/vectorizer",'wb'))

In [2]:
vectorizer = pickle.load(open("Models/vectorizer",'rb'))
cv = vectorizer.transform(myDataset['clean_essay'])
X = cv.toarray()
cv = 0; # Too free up ram
vocab = vectorizer.vocabulary_
vocab

{'dear': 4223,
 'local': 10057,
 'newspaper': 11761,
 'think': 17775,
 'effects': 5163,
 'computers': 3244,
 'people': 12998,
 'great': 7264,
 'learning': 9360,
 'time': 18052,
 'chat': 2607,
 'helps': 7764,
 'learn': 9282,
 'globe': 6902,
 'keeps': 8736,
 'thing': 17595,
 'dont': 4906,
 'feel': 6085,
 'teenager': 17390,
 'phone': 13529,
 'friends': 6558,
 'buisness': 2151,
 'partner': 12830,
 'things': 17648,
 'new': 11695,
 'way': 19301,
 'computer': 3072,
 'plenty': 13703,
 'sites': 16106,
 'internet': 8581,
 'facebook': 5738,
 'myspace': 11430,
 'ect': 5121,
 'setting': 15698,
 'meeting': 10813,
 'boss': 1774,
 'fun': 6672,
 'rushing': 15079,
 'cause': 2413,
 'want': 19056,
 'use': 18736,
 'outside': 12509,
 'going': 6914,
 'child': 2657,
 'spends': 16428,
 'lot': 10192,
 'ask': 766,
 'question': 14130,
 'economy': 5119,
 'sea': 15537,
 'floor': 6343,
 'spreading': 16462,
 'surprise': 17108,
 'heshe': 7782,
 'knows': 8982,
 'believe': 1245,
 'interesting': 8572,
 'class': 2842,
 'd

In [14]:
myDataset.iloc[:, 2:]

Unnamed: 0,Sat500,char_count,word_count,sentences_count,spelling_mistake_count,avg_word_len,count_nouns,count_adjectives,count_adverts,count_verbs
0,0,904,157,1,12,5.757962,68,29,12,36
1,6,1220,217,1,11,5.622120,95,24,10,61
2,6,789,132,1,2,5.977273,71,19,3,36
3,5,1626,262,1,21,6.206107,125,42,13,66
4,2,1317,225,1,9,5.853333,110,24,15,49
...,...,...,...,...,...,...,...,...,...,...
12971,7,1936,367,1,1,5.275204,129,55,48,81
12972,7,1210,248,1,5,4.879032,93,41,37,57
12973,10,2060,346,1,11,5.953757,131,56,32,95
12974,9,1315,236,1,2,5.572034,81,41,38,59


**Target** in y axis, whereas all the features into x axis

In [3]:
Y = myDataset['normalized_score'].values
X = np.concatenate((myDataset.iloc[:, 2:], X), axis = 1)
myCheck = pd.DataFrame(X)
myCheck
print("My X", myCheck)
print("My Y: ", Y)
myCheck = 0

My X        0       1      2      3      4         5      6      7      8      \
0        0.0   904.0  157.0    1.0   12.0  5.757962   68.0   29.0   12.0   
1        6.0  1220.0  217.0    1.0   11.0  5.622120   95.0   24.0   10.0   
2        6.0   789.0  132.0    1.0    2.0  5.977273   71.0   19.0    3.0   
3        5.0  1626.0  262.0    1.0   21.0  6.206107  125.0   42.0   13.0   
4        2.0  1317.0  225.0    1.0    9.0  5.853333  110.0   24.0   15.0   
...      ...     ...    ...    ...    ...       ...    ...    ...    ...   
12971    7.0  1936.0  367.0    1.0    1.0  5.275204  129.0   55.0   48.0   
12972    7.0  1210.0  248.0    1.0    5.0  4.879032   93.0   41.0   37.0   
12973   10.0  2060.0  346.0    1.0   11.0  5.953757  131.0   56.0   32.0   
12974    9.0  1315.0  236.0    1.0    2.0  5.572034   81.0   41.0   38.0   
12975    4.0  1248.0  215.0    1.0    1.0  5.804651   78.0   50.0   21.0   

       9      ...  20000  20001  20002  20003  20004  20005  20006  20007  \
0    

**train_test_split** splits the data into training and test datasets. It then prints the size of each of these datasets.

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(9083, 20010) (3893, 20010) (9083,) (3893,)


**Machine Learning Models**

**Linear Regression**

In [5]:
# Run only to build the model - it takes 7 mins to train
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)
pickle.dump(linear_regressor,open("Models/LR",'wb'))
print('Intercept:',linear_regressor.intercept_)
print('Slope:', linear_regressor.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

Intercept: 299143018.3178303
Slope: [-2.03683174e+07 -1.11560467e+06  8.62203133e+06 ...  1.66972630e+08
 -4.57095785e+07  2.85053794e+07]
Mean squared error: 1073346938288667520.00


In [5]:
linear_regressor = pickle.load(open("Models/LR",'rb'))
y_pred = linear_regressor.predict(X_test)
print('Intercept:',linear_regressor.intercept_)
print('Slope:', linear_regressor.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("y_pred: ", y_pred)

Intercept: 299143018.3178303
Slope: [-2.03683174e+07 -1.11560467e+06  8.62203133e+06 ...  1.66972630e+08
 -4.57095785e+07  2.85053794e+07]


NameError: name 'y_pred' is not defined

As you can see above by using **linear regression** we have a very bad mean squared error which mean the our predictions of the grades will be very bad and illogical therefore now we will implement the random forest model to see how much better is it than the linear regression.

**Random Forest Regressor**

In [6]:
# Only to train the model - takes 16 mins
rf = RandomForestRegressor(n_jobs=6)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
pickle.dump(rf,open("Models/RF",'wb'))

Mean squared error: 2.06


In [8]:
rf = pickle.load(open("Models/RF",'rb'))
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Evaluating the model using MAE Evaluation Metric
print("Mean absoute error: ", mean_absolute_error(y_test, y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2)
print("y_pred: ", y_pred)

Mean absoute error:  0.6134960184947342
Mean squared error: 0.91
R2 Score:  0.8513090978914925
y_pred:  [2.71 8.57 5.74 ... 8.79 4.99 2.98]


As you can see by the results above we have greatly improved the mean squared error by moving towards Random forest model. I will now try to tune the model.

**Tuning the Random Forest**

In [18]:
rf = RandomForestRegressor(n_estimators = 2000,random_state=12,n_jobs=6)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
pickle.dump(rf,open("Models/RFtuned",'wb'))

Mean squared error: 2.03


In [9]:
rf = pickle.load(open("Models/RFtuned",'rb'))
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Evaluating the model using MAE Evaluation Metric
print("Mean absoute error: ", mean_absolute_error(y_test, y_pred))
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2)
print("y_pred: ", y_pred)

Mean absoute error:  0.5975148835516739
Mean squared error: 0.84
R2 Score:  0.8621468535953871
y_pred:  [3.4425     8.30791667 3.167      ... 9.557      4.969      2.89466667]


In [10]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = 6)# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [None]:
# Create a variable for the best model
best_rf = rand_search.best_estimator_
# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)