In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import gc 
import regex as re
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
pd.set_option('display.max_colwidth', 500)

In [4]:
df = pd.read_csv('data/df_01.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,excerpt,target,word_count,char_count,period_count,ques_count,excl_count,sent_count
0,1705,"The commutator is peculiar, consisting of only three segments of a copper ring, while in the simplest of other continuous current generators several times that number exist, and frequently 120! segments are to be found. These three segments are made so as to be removable in a moment for cleaning or replacement. They are mounted upon a metal support, and are surrounded on all sides by a free air space, and cannot, therefore, lose their insulated condition. This feature of air insulation is pe...",0.0,177,1068,7,0,1,8


In [7]:
# Load English tokenizer, rtagger, parser and NER
nlp = spacy.load("en_core_web_sm")
doc = nlp(df.excerpt[0])
lemmatizer = nlp.get_pipe("lemmatizer")



In [8]:
def tokenize(text):
    # normalize case and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize tf-idf vectorizer object
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.excerpt)

In [19]:
X = pd.DataFrame(X.toarray())

In [21]:
df = pd.concat([df, X], axis=1)

In [22]:
df.shape

(2834, 26842)

In [24]:
df.to_csv('data/df_vect.csv')

### Random Forest Decision Regressor - Root Mean Squared Error: 0.15382661962874808

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(df.drop(['excerpt','target', 'Unnamed: 0'], axis=1), df.target, test_size=0.33, random_state=27)


In [33]:
y_train

2030    0.625485
869     0.403285
61      0.109119
858     0.400387
1607    0.547773
          ...   
141     0.179607
1317    0.494885
752     0.376969
2591    0.769079
1043    0.440932
Name: target, Length: 1898, dtype: float64

In [34]:
rnd_reg = RandomForestRegressor(n_jobs=-1, random_state=27, criterion='mse',max_depth=16,
                                max_features='auto', max_leaf_nodes=64, n_estimators=1000)
rnd_reg.fit(X_train, y_train)

RandomForestRegressor(max_depth=16, max_leaf_nodes=64, n_estimators=1000,
                      n_jobs=-1, random_state=27)

In [35]:
y_pred = rnd_reg.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Root Mean Squared Error: 0.15382661962874808


### Support Vector Regressor

In [56]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train))
X_test = pd.DataFrame(min_max_scaler.fit_transform(X_test))
y_train = min_max_scaler.fit_transform(pd.DataFrame(y_train))[:,0]
y_test = min_max_scaler.fit_transform(pd.DataFrame(y_test))[:,0]


In [55]:
X_train.shape

(1898, 26839)

In [57]:
from sklearn.svm import SVR
svm_lin_reg = SVR(kernel="linear", C=100, epsilon=0.1)
svm_lin_reg.fit(X_train, y_train)


SVR(C=100, kernel='linear')

In [58]:
y_pred = svm_lin_reg.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Root Mean Squared Error: 0.14153505970773578


In [60]:
svm_poly_reg = SVR(kernel="poly", degree=3,C=100, epsilon=0.1)
svm_poly_reg.fit(X_train, y_train)
y_pred = svm_poly_reg.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Root Mean Squared Error: 0.20895050671393425


In [61]:
svm_rbf_reg = SVR(kernel="rbf", C=100, epsilon=0.1)
svm_rbf_reg.fit(X_train, y_train)
y_pred = svm_rbf_reg.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Root Mean Squared Error: 0.16256582443206286


In [65]:
from sklearn.model_selection import GridSearchCV
svr_poly = SVR(kernel="poly")
param_grid = { 
    'degree': [3,5,10],
    'C': [1, 100, 1000],
    'epsilon':[0.1,0.05]
}
cv_svr_poly = GridSearchCV(estimator=svr_poly, param_grid=param_grid, cv=5)
cv_svr_poly.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVR(kernel='poly'),
             param_grid={'C': [1, 100, 1000], 'degree': [3, 5, 10],
                         'epsilon': [0.1, 0.05]})

In [66]:
print(cv_svr_poly.best_params_)
y_pred = cv_svr_poly.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

{'C': 1, 'degree': 3, 'epsilon': 0.1}
Root Mean Squared Error: 0.2062639625718727


In [67]:
svr_rbf = SVR(kernel="rbf")
param_grid = { 
    'C': [1, 50,100]
}
cv_svr_rbf = GridSearchCV(estimator=svr_rbf, param_grid=param_grid, cv=5)
cv_svr_rbf.fit(X_train, y_train)
print(cv_svr_rbf.best_params_)
y_pred = cv_svr_rbf.predict(X_test)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

{'C': 1}
Root Mean Squared Error: 0.16256582443206286
