In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import scipy
from scipy.sparse import csr_matrix
from scipy import sparse
import datetime


In [2]:
#Read Data
# Both CSVs are read from a Data/ folder relative to the notebook's working directory.
train = pd.read_csv('Data/train_dataset.csv')
test = pd.read_csv('Data/test_dataset.csv')

In [None]:
#explore training Data

In [5]:
# List each column's dtype to see which columns are numeric and which are text (object).
train.dtypes

ID             int64
Essayset     float64
min_score      int64
max_score      int64
score_1        int64
score_2        int64
score_3      float64
score_4      float64
score_5      float64
clarity       object
coherent      object
EssayText     object
dtype: object

In [None]:
#the dataset has 12 columns in total,
#of which 3 hold text data (clarity, coherent, EssayText)

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,Essayset,min_score,max_score,score_1,score_2,score_3,score_4,score_5,clarity,coherent,EssayText
0,1,1.0,0,3,1,1,1.0,1.0,1.0,average,worst,Some additional information that we would need...
1,2,1.0,0,3,1,1,,1.5,1.0,excellent,worst,"After reading the expirement, I realized that ..."
2,3,1.0,0,3,1,1,1.0,1.0,1.5,worst,above_average,"What you need is more trials, a control set up..."
3,4,1.0,0,3,0,0,0.0,0.0,1.0,worst,worst,The student should list what rock is better an...
4,5,1.0,0,3,2,2,2.0,2.5,1.0,above_average,worst,For the students to be able to make a replicat...


In [10]:
#looks like there are some missing values... let's check

In [11]:
# Missing-value summary: per-column count and percentage of nulls, largest first.
null_mask = train.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
# null_mask.count() is the number of rows per column, so this is the null share in percent.
percent = ((null_counts / null_mask.count()) * 100).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
Essayset,157,0.921199
score_3,147,0.862524
coherent,145,0.850789
score_5,144,0.844922
clarity,138,0.809717
score_4,136,0.797982
EssayText,0,0.0
score_2,0,0.0
score_1,0,0.0
max_score,0,0.0


In [33]:
#6 columns have missing data; each one is imputed with its OWN most frequent value.
#
# clarity and coherent are categorical text columns, so they must be filled with
# their own mode. Filling them with the mode of score_5 (a number) would inject a
# numeric value into the categories and create a bogus level when they are
# one-hot encoded later.
#
# score_1 and score_2 currently have no nulls; they stay in the list so the cell
# remains safe if the input data changes.
columns_to_fill = [
    'Essayset',
    'score_1', 'score_2', 'score_3', 'score_4', 'score_5',
    'clarity', 'coherent',
]

for column in columns_to_fill:
    # mode() returns a Series of the most frequent value(s); [0] takes the first one.
    train[column] = train[column].fillna(train[column].mode()[0])

In [34]:
#lets check if still there's any missing value
train.isnull().sum()

ID           0
Essayset     0
min_score    0
max_score    0
score_1      0
score_2      0
score_3      0
score_4      0
score_5      0
clarity      0
coherent     0
EssayText    0
dtype: int64

In [56]:
# The target variable is the average of the scores given to an essay by the
# 5 evaluators. The training data holds the 5 individual scores, so combine
# them into one AvgScore column that the target is derived from.
score_columns = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
train['AvgScore'] = sum(train[name] for name in score_columns) / 5

In [40]:
#our target var
# Round the average to a whole score; y is later passed to lr.fit() as the class label.
# NOTE(review): Series.round rounds exact halves to the nearest even value
# (0.5 -> 0.0, 1.5 -> 2.0, 2.5 -> 2.0) — confirm this matches the scoring rule.
y = train['AvgScore'].round()

In [43]:
#let's look at the essay text
train['EssayText'].head()

0    Some additional information that we would need...
1    After reading the expirement, I realized that ...
2    What you need is more trials, a control set up...
3    The student should list what rock is better an...
4    For the students to be able to make a replicat...
Name: EssayText, dtype: object

In [44]:
#we can see that it contains stopwords and punctuation, so we first need to clean this text

#function to clean raw text using nlp

# Built once instead of on every call: clean_text is applied to every essay,
# and the stop-word set, the stemmer and the regex do not depend on the input.
_CLEAN_TEXT_STOPS = frozenset(stopwords.words("english"))
_CLEAN_TEXT_STEMMER = SnowballStemmer("english", ignore_stopwords=True)
_CLEAN_TEXT_NON_LETTERS = re.compile("[^a-zA-Z]")


def clean_text(raw_text):
    """Normalise one essay into a space-separated string of stemmed tokens.

    Steps, in order:
      1. strip surrounding whitespace;
      2. replace every character that is not an ASCII letter with a space
         (this removes punctuation and digits);
      3. lower-case and split on whitespace;
      4. drop NLTK English stop words;
      5. stem the remaining words with the English Snowball stemmer.

    Parameters
    ----------
    raw_text : str
        The raw essay text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    letters_only = _CLEAN_TEXT_NON_LETTERS.sub(" ", raw_text.strip())
    words = letters_only.lower().split()
    return " ".join(
        _CLEAN_TEXT_STEMMER.stem(word)
        for word in words
        if word not in _CLEAN_TEXT_STOPS
    )

In [45]:
# Length features around the cleaning step.
raw_essays = train['EssayText']
cleaned_essays = raw_essays.map(clean_text)

# character length of each essay BEFORE cleaning
train['EssayText_before_len'] = raw_essays.map(len)
# the raw text is replaced by the cleaned text from here on
train['EssayText'] = cleaned_essays
# character length of each essay AFTER cleaning
train['EssayText_after_len'] = cleaned_essays.map(len)

# number of characters removed by cleaning (stopwords, punctuation, stemming)
train['Stopword_Used'] = train['EssayText_before_len'] - train['EssayText_after_len']

# Stopword_Used could be used as a training feature; note that it is not
# listed in `feats`, only the two length columns are.

In [47]:
#convert categorical text vars to numerical dummy (one-hot) vars
# drop_first=True drops the first level of each column, which then acts as the baseline.
# NOTE(review): presumably the baseline is 'above_average' (it is the only level in the
# preview that has no matching column in `feats`) — confirm.
# This replaces the 'clarity'/'coherent' columns, so the cell cannot be re-run on the
# same frame without reloading the data.
train=pd.get_dummies(train,columns=['clarity','coherent'],drop_first=True)

In [49]:
#columns of `train` used as model features alongside the TF-IDF matrices:
#the maximum score, the two essay-length columns and the clarity/coherent dummy columns
feats =['max_score', 'EssayText_before_len', 'EssayText_after_len',
        'clarity_average', 'clarity_excellent','clarity_worst', 
        'coherent_average', 'coherent_excellent','coherent_worst']


In [50]:
#using tf idf

# Word-level TF-IDF over unigrams to trigrams of the cleaned essays.
vec_cs = TfidfVectorizer(ngram_range=(1,3),stop_words="english", analyzer='word')
consum_comp_sum = vec_cs.fit_transform(train['EssayText'])

# Character-level TF-IDF over 1- to 9-character n-grams.
# stop_words is not passed here: scikit-learn only applies it when
# analyzer == 'word', so it had no effect on the char-level features.
vec_cs_char = TfidfVectorizer(ngram_range=(1,9), analyzer='char')
consum_comp_sum_char = vec_cs_char.fit_transform(train['EssayText'])

# Stack the tabular features next to both TF-IDF matrices.
# The tabular block is cast to float first: the dummy columns can be bool
# (depending on the pandas version) while the others are integers, and a
# mixed bool/int frame converts to an object array that scipy.sparse cannot stack.
final_features = scipy.sparse.hstack(
    (train[feats].astype(float), consum_comp_sum, consum_comp_sum_char)
).tocsr()


#so now we have the training data ready to model

In [51]:
#let's now prepare the test data in the same way
#first check it for missing values
test.isnull().sum()
#the test data does not have any missing values, so no imputation is needed

ID           0
Essayset     0
min_score    0
max_score    0
clarity      0
coherent     0
EssayText    0
dtype: int64

In [52]:
#do the same for test file
test['EssayText_before_len']=test['EssayText'].apply(len)
test['EssayText']=test['EssayText'].apply(clean_text)
test['EssayText_after_len']=test['EssayText'].apply(len)
test['Stopword_Used'] = test['EssayText_before_len'] - test['EssayText_after_len']

#one-hot encode the categorical text columns
# All levels are kept here (no drop_first): which level is "first" depends on the
# levels present in THIS frame, so dropping it independently of train could remove
# a different level than the one train dropped. The columns actually used are
# selected through `feats` below, which leaves out train's baseline level.
test=pd.get_dummies(test,columns=['clarity','coherent'])

# A level that never occurs in the test data produces no dummy column; add it as
# all zeros so the test matrix has exactly the columns the model was trained on.
for dummy_col in feats:
    if dummy_col.startswith(('clarity_', 'coherent_')) and dummy_col not in test.columns:
        test[dummy_col] = 0

# Reuse the vectorizers fitted on train (transform only, no refit).
consum_comp_sum_test = vec_cs.transform(test['EssayText'])
consum_comp_sum_test_char = vec_cs_char.transform(test['EssayText'])

# Cast the tabular block to float: mixed bool/int columns would otherwise convert
# to an object array that scipy.sparse cannot stack.
final_features_test = scipy.sparse.hstack(
    (test[feats].astype(float), consum_comp_sum_test, consum_comp_sum_test_char)
).tocsr()


In [53]:
# Fit a logistic regression on the stacked training features and predict the test set.
# class_weight='balanced' reweights the classes by their frequency in y.
# NOTE(review): no solver is set, so the library default is used. The recorded output
# below shows [LibLinear]; newer scikit-learn releases default to a different solver,
# so results may differ — confirm the intended solver. The warning fragment in that
# output mentions n_jobs; presumably n_jobs has no effect with that solver — verify.
lr = LogisticRegression(
    C=5,
    class_weight='balanced',
    random_state=1996,
    n_jobs=-1,
    verbose=1,
)
lr.fit(final_features, y)
lr_pred = lr.predict(final_features_test)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

In [54]:
#make Submission - id,essay_set,essay_score
# essay_score holds the class labels predicted by the model. y was built with
# .round(), which keeps the float dtype.
# NOTE(review): cast to int if the submission format requires integer scores.
submission = pd.DataFrame(
    {
        'id': test['ID'],
        'essay_set': test['Essayset'],
        'essay_score': lr_pred,
    }
)

# Write a timestamped file (day-month-year-hour-minute-second) so earlier
# submissions are not overwritten.
submission.to_csv('submission_LR_' + datetime.datetime.now().strftime("%d%m%Y%H%M%S") + '.csv',index=False)

# Kept as the last expression so the preview is actually displayed by the cell.
submission.head()