# Import Libraries

In [103]:
import pandas as pd
import numpy as np
import spacy
import re
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from spacytextblob.spacytextblob import SpacyTextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from scipy import sparse
from tqdm import tqdm

In [4]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7fe2b08646d0>

# Load Source Data

In [5]:
train_data = pd.read_csv("feedback-prize-effectiveness/train.csv")
test_data = pd.read_csv("feedback-prize-effectiveness/test.csv")

In [6]:
print(train_data.shape, test_data.shape)

(36765, 5) (10, 4)


In [7]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [8]:
test_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36765 non-null  object
 1   essay_id                 36765 non-null  object
 2   discourse_text           36765 non-null  object
 3   discourse_type           36765 non-null  object
 4   discourse_effectiveness  36765 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [10]:
train_data.discourse_type.unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [11]:
train_data.discourse_effectiveness.unique()

array(['Adequate', 'Ineffective', 'Effective'], dtype=object)

##### TODO: Build a cross table

In [12]:
print("Count of Discourse Effectiveness")

train_data.discourse_effectiveness.value_counts()

Count of Discourse Effectiveness


Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

In [13]:
print("Discourse Effectiveness Percent of Total")

train_data.discourse_effectiveness.value_counts() / train_data.shape[0]

Discourse Effectiveness Percent of Total


Adequate       0.570570
Effective      0.253665
Ineffective    0.175765
Name: discourse_effectiveness, dtype: float64

In [14]:
train_data.replace({"Ineffective":0,"Adequate":1,"Effective":2},inplace=True)

In [15]:
train_data.discourse_effectiveness.value_counts()

1    20977
2     9326
0     6462
Name: discourse_effectiveness, dtype: int64

In [16]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1


# Load Spacy Pipeline

In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
docs = [*nlp.pipe(train_data.discourse_text)]

# Functions

In [28]:
def pos_count(doc):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    adv_count = 0  #Adverb
    con_count = 0  #Conjuntion
    adj_count = 0  #Adjective
    intj_count = 0   #Interjection
    num_count = 0   #Numerics
    
    # sent = nltk.word_tokenize(sent)
    # sent = nltk.pos_tag(sent)

    for token in doc:
        if token.pos_ == 'NOUN': #Noun
            nn_count += 1

        if token.pos_ == "PRON": #Pronoun
            pr_count += 1
        
        if token.pos_ == "CONJ": #Conjunction
            con_count += 1

        if token.pos_ == 'VERB': #Verb
            vb_count += 1

        if token.pos_ == 'ADV': #Adverb
            adv_count += 1
        
        if token.pos_ == 'ADJ': #Adjective
            adj_count += 1

        if token.pos_ == 'INTJ': #Interjection
            intj_count += 1

        if token.pos_ == 'NUM': #Numerics
            num_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, adv_count, con_count, adj_count, intj_count, num_count])

In [35]:
def text_features(df, col,col2):
    df[f"{col}_num_words"] = df[col].apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].apply(lambda x: len([w for w in str(x).lower().split() if w in nlp.Defaults.stop_words]))
    # df[f"{col}_num_punctuations"] = df[col].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
    df[f"{col}_num_words_upper"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df[f"{col}_mean_word_len"] = df[col].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df[f"{col}_num_paragraphs"] = df[col2].apply(lambda x: len(x.split('\n')))
    # df[f"{col}_num_contractions"] = df[col].progress_apply(contraction_count)
    # df[f"{col}_polarity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[0])
    # df[f"{col}_subjectivity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[1])
    df[[f'{col}_nn_count',f'{col}_pr_count',f'{col}_con_count',f'{col}_vb_count',f'{col}_adv_count',f'{col}_adj_count',f'{col}_intj_count',f'{col}_num_count']] = df[col].apply(pos_count)
    return df

# Multinomial Naive Bayes

In [36]:
train_data["discourse_text_docs"] = docs

In [37]:
train_data2 = text_features(train_data,"discourse_text_docs","discourse_text").copy()

In [39]:
cv = CountVectorizer()
X = train_data2.drop(columns={'discourse_effectiveness'}).copy()
y = train_data2.discourse_effectiveness.copy()
X_train2 = cv.fit_transform(X.discourse_text)

In [43]:
dummy_variables = pd.get_dummies(train_data2.discourse_type)

In [54]:
td3 = train_data2.merge(dummy_variables,left_on=train_data2.index, right_on= dummy_variables.index).iloc[:,7:].copy()

In [223]:
X_training_data = sparse.hstack((X_train2,td3))

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_training_data,y,test_size=0.25,random_state=8)

mn_classifier = MultinomialNB(alpha=14)

mn_classifier.fit(X_train,y_train)

ypred = mn_classifier.predict(X_test)

# sklearn.metrics.confusion_matrix(y_test,ypred)

In [63]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6178198433420365

# Random Forests

In [68]:
rf_classifier = RandomForestClassifier(n_estimators=300
                                     , criterion="entropy"
                                     , max_depth=None
                                     , min_samples_split=2
                                     , min_samples_leaf=1
                                     , min_weight_fraction_leaf=0
                                     , max_features="sqrt"
                                     , max_leaf_nodes=None
                                     , min_impurity_decrease=0
                                     , bootstrap=True
                                     , oob_score=False
                                     , n_jobs=None
                                     , random_state=8
                                     , verbose=0
                                     , warm_start=False
                                     , class_weight=None
                                     , ccp_alpha=0
                                     , max_samples=None)

In [65]:
rf_classifier.fit(X_train,y_train)

In [66]:
ypred = rf_classifier.predict(X_test)

In [67]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6427328111401218

# SVM

In [77]:
sgdc = SGDClassifier(loss="hinge")

In [87]:
sc = StandardScaler()

In [88]:
X_training_data = sparse.hstack(((X_train2,sc.fit_transform(td3))))

In [89]:
from sklearn.svm import SVC

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X_training_data,y,test_size=0.25,random_state=8)

In [92]:
from sklearn.svm import SVC

svc = SVC(kernel="rbf")

svc.fit(X_train,y_train)

In [93]:
svc.fit(X_train,y_train)

In [94]:
ypred = svc.predict(X_test)

In [95]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6552436901653612

In [231]:
svc.predict(X_test[1:])

array([1, 1, 1, ..., 1, 2, 1])

In [None]:
svc.predict_proba

In [230]:
X_test[1:]

<9191x27759 sparse matrix of type '<class 'numpy.float64'>'
	with 492679 stored elements in Compressed Sparse Row format>

In [None]:
# svc = SVC(kernel="rbf")

In [104]:
# cross_val_score(svc,X_train,y_train).mean()

# Xgboost

In [216]:
xgb_classifier = xgb.XGBClassifier(learning_rate=0.37,n_estimators=195)

In [217]:
xgb_classifier.fit(X_train,y_train)

Parameters: { "bootstrap" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [218]:
ypred = xgb_classifier.predict(X_test)

In [219]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.667319408181027

# XGBoost Cross Validation

In [200]:
xgb_classifier = xgb.XGBClassifier(learning_rate=0.37,n_estimators=195)

In [201]:
cross_val_score(xgb_classifier,X_train,y_train,cv=10).mean()

0.6645631805966644

# XGBoost - Tunning

In [211]:

# learn_rate = np.arange(3/10,3/8,0.01)
n = np.arange(100,200,5)
train_score = np.zeros(len(n))
test_score = np.zeros(len(n))
# recall_score = np.zeros(len(learn_rate))
# precision_score = np.zeros(len(learn_rate))

In [212]:
# i = 0

for i,n in enumerate(n):
    xgb_classifier = xgb.XGBClassifier(learning_rate=0.37,n_estimators=n)
    xgb_classifier.fit(X_train,y_train)
    train_score[i] = xgb_classifier.score(X_train, y_train)
    test_score[i] = xgb_classifier.score(X_test, y_test)
    # recall_score[i] = sklearn.metrics.recall_score(y_test,xgb_classifier.predict(X_test))
    # precision_score[i] = sklearn.metrics.precision_score(y_test,xgb_classifier.predict(X_test))
    # i+=1

In [213]:
n = np.arange(100,200,5)

In [214]:
acc_results = np.matrix(np.c_[n
                             ,train_score
                             ,test_score
                             #,recall_score
                             #,precision_score
                             ])

models = pd.DataFrame(data=acc_results
            ,columns=["Estimators"
                     ,"Train ACC"
                     ,"Test ACC"
                     # ,"Recall Score"
                     # ,"Precision Score"
                     ])

In [215]:
models

Unnamed: 0,Estimators,Train ACC,Test ACC
0,100.0,0.795089,0.664926
1,105.0,0.797519,0.665796
2,110.0,0.801001,0.665035
3,115.0,0.803576,0.664164
4,120.0,0.805317,0.664817
5,125.0,0.808327,0.665252
6,130.0,0.810975,0.665144
7,135.0,0.813441,0.66547
8,140.0,0.816378,0.665035
9,145.0,0.818881,0.665579


In [140]:
for i, n in enumerate(models):
    print(i)
    print(models.iloc[i,1] - models.iloc[i + 1,1])

0
-0.0023936459580023994
1
-0.0008704167120009432
2
-0.004642222464004586


In [143]:
models.iloc[0 - 1,1]

0.7950893990497951

# XGBoost With GridSearchSV and RandomSearchSV

# LogisticRegressionCV

In [96]:
lgr_classifier = LogisticRegression(random_state=8)

In [97]:
lgr_classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [99]:
ypred = lgr_classifier.predict(X_test)

In [100]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.643059181897302