# Import Libraries

In [358]:
import pandas as pd
import numpy as np
import spacy
import re
import sklearn
from sklearn.model_selection import train_test_split
from collections import Counter
from spacytextblob.spacytextblob import SpacyTextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [327]:
class ClassificationSummary:
    
    def __init__(self, y_true: np.array ,y_pred: np.array):
        
        self.__y_true: array = y_true
        self.__y_pred: array = y_pred
        self.__confusion_matrix: np.ndarray
        self.__sensitivity: float
        self.__specificity: float
        self.__accuracy: float
        self.missclassification_error: float
        self.__recall_score: float
        self.__precision_score: float
        self.__summary_data()
    
    @property
    def y_true(self):
        return self.__y_true
    
    @property
    def y_pred(self):
        return self.__y_pred
    
    @property
    def confusion_matrix(self):
        return self.__confusion_matrix
    
    @property
    def sensitivity(self):
        return self.__sensitivity
    
    @property
    def specificity(self):
        return self.__specificity
    
    @property
    def accuracy(self):
        return self.__accuracy

    @property
    def recall_score(self):
        return self.__recall_score
    
    @property
    def precision_score(self):
        return self.__precision_score
    
    @property
    def missclassification_error(self):
        return self.__missclassification_error
    
    def __summary_data(self):
        self.__confusion_matrix = sklearn.metrics.confusion_matrix(self.__y_true, self.__y_pred)
        tn, fp, fn, tp = self.__confusion_matrix.ravel()
        self.__sensitivity: float = tp / (tp + fn)
        self.__specificity: float = tn / (tn + fp)
        self.__accuracy: float = (tn + tp) / (tn + fp + fn + tp)
        self.__missclassification_error: float = 1 - self.__accuracy
        self.__recall_score = sklearn.metrics.recall_score(self.__y_true, self.__y_pred)
        self.__precision_score = sklearn.metrics.precision_score(self.__y_true, self.__y_pred)
            
    def __repr__(self):

        r2 = lambda x: round(x * 100,2)
        
        return(
                "Confusion Matrix:\n{}".format(self.__confusion_matrix)
               +"\n"
               +"Accuracy: {}%".format(r2(self.__accuracy))
               +"\n"
               +"Missclassification Error: {}%".format(r2(self.__missclassification_error))
               +"\n"
               +"Sensitivity: {}%".format(r2(self.__sensitivity))
               +"\n"
               +"Specificity: {}%".format(r2(self.__specificity))
               +"\n"
               +"Recall Score: {}%".format(r2(self.__recall_score))
               +"\n"
               +"Precision Score: {}%".format(r2(self.__precision_score))
              )

In [270]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7fac41010550>

# Load Source Data

In [271]:
train_data = pd.read_csv("feedback-prize-effectiveness/train.csv")
test_data = pd.read_csv("feedback-prize-effectiveness/test.csv")

In [272]:
print(train_data.shape, test_data.shape)

(36765, 5) (10, 4)


In [273]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [274]:
test_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [275]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36765 non-null  object
 1   essay_id                 36765 non-null  object
 2   discourse_text           36765 non-null  object
 3   discourse_type           36765 non-null  object
 4   discourse_effectiveness  36765 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [276]:
train_data.discourse_type.unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [277]:
train_data.discourse_effectiveness.unique()

array(['Adequate', 'Ineffective', 'Effective'], dtype=object)

##### TODO: Build a cross table

In [278]:
print("Count of Discourse Effectiveness")

train_data.discourse_effectiveness.value_counts()

Count of Discourse Effectiveness


Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

In [279]:
print("Discourse Effectiveness Percent of Total")

train_data.discourse_effectiveness.value_counts() / train_data.shape[0]

Discourse Effectiveness Percent of Total


Adequate       0.570570
Effective      0.253665
Ineffective    0.175765
Name: discourse_effectiveness, dtype: float64

In [280]:
train_data.replace({"Ineffective":0,"Adequate":1,"Effective":2},inplace=True)

In [281]:
train_data.discourse_effectiveness.value_counts()

1    20977
2     9326
0     6462
Name: discourse_effectiveness, dtype: int64

In [282]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1


In [283]:
X_train = train_data.iloc[:,:-1].values
y_train = train_data.iloc[:,-1].values

In [284]:
print(f"Train Features Matrix: {X_train.shape}"
+ "\n"
+ f"Target:{y_train.shape}" +
"\n"
+ f"Test Feature Matrix: {test_data.shape}")

Train Features Matrix: (36765, 4)
Target:(36765,)
Test Feature Matrix: (10, 4)


In [285]:
nlp = spacy.load("en_core_web_sm")

In [286]:
docs = [*nlp.pipe(X_train[:,2])]

In [287]:
docs[0]

Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. 

In [288]:
len(train_data["discourse_text"][0])

317

In [289]:
# Returns number of characters
len(docs[0].text)

317

In [290]:
# Returns number of words
len(docs[0])

76

In [291]:
for i in Counter(token.pos_ for token in docs[1]).keys():
    print(i,spacy.explain(i))

ADP adposition
PRON pronoun
NOUN noun
PUNCT punctuation
VERB verb
SCONJ subordinating conjunction
DET determiner
AUX auxiliary
ADJ adjective
PART particle
PROPN proper noun


In [292]:
def pos_count(doc):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    adv_count = 0  #Adverb
    con_count = 0  #Conjuntion
    adj_count = 0  #Adjective
    intj_count = 0   #Interjection
    num_count = 0   #Numerics
    
    # sent = nltk.word_tokenize(sent)
    # sent = nltk.pos_tag(sent)

    for token in doc:
        if token.pos_ == 'NOUN': #Noun
            nn_count += 1

        if token.pos_ == "PRON": #Pronoun
            pr_count += 1
        
        if token.pos_ == "CONJ": #Conjunction
            con_count += 1

        if token.pos_ == 'VERB': #Verb
            vb_count += 1

        if token.pos_ == 'ADV': #Adverb
            adv_count += 1
        
        if token.pos_ == 'ADJ': #Adjective
            adj_count += 1

        if token.pos_ == 'INTJ': #Interjection
            intj_count += 1

        if token.pos_ == 'NUM': #Numerics
            num_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, adv_count, con_count, adj_count, intj_count, num_count])

In [293]:
def text_features(df, col):
    df[f"{col}_num_words"] = df[col].apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].apply(lambda x: len([w for w in str(x).lower().split() if w in nlp.Defaults.stop_words]))
    # df[f"{col}_num_punctuations"] = df[col].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
    df[f"{col}_num_words_upper"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df[f"{col}_mean_word_len"] = df[col].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    # df[f"{col}_num_paragraphs"] = df[col].progress_apply(lambda x: len(x.split('\n')))
    # df[f"{col}_num_contractions"] = df[col].progress_apply(contraction_count)
    # df[f"{col}_polarity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[0])
    # df[f"{col}_subjectivity"] = df[col].progress_apply(lambda x: TextBlob(x).sentiment[1])
    df[[f'{col}_nn_count',f'{col}_pr_count',f'{col}_con_count',f'{col}_vb_count',f'{col}_adv_count',f'{col}_adj_count',f'{col}_intj_count',f'{col}_num_count']] = df[col].apply(pos_count)
    return df

In [294]:
train_data["discourse_text_docs"] = docs

In [380]:
train_data2 = text_features(train_data,"discourse_text_docs").copy()

In [310]:
X = train_data2.drop(columns={'discourse_effectiveness'}).copy()
y = train_data2.discourse_effectiveness.copy()

In [350]:
cv = CountVectorizer()

X_train2 = cv.fit_transform(X.discourse_text)

X_train, X_test, y_train, y_test = train_test_split(X_train2,y,test_size=0.25,random_state=8)

mn_classifier = MultinomialNB(alpha=14)

mn_classifier.fit(X_train,y_train)

ypred = mn_classifier.predict(X_test)

# sklearn.metrics.confusion_matrix(y_test,ypred)

In [351]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6119451697127938

In [352]:
cv = CountVectorizer(stop_words='english')

X_train2 = cv.fit_transform(X.discourse_text)

X_train, X_test, y_train, y_test = train_test_split(X_train2,y,test_size=0.25,random_state=8)

mn_classifier = MultinomialNB(alpha=14.4)

mn_classifier.fit(X_train,y_train)

ypred = mn_classifier.predict(X_test)

# sklearn.metrics.confusion_matrix(y_test,ypred)

In [353]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.608899042645779

In [364]:
dummy_variables = pd.get_dummies(train_data2.discourse_type)

In [379]:
train_data2.merge(dummy_variables,left_on=train_data2.index, right_on= dummy_variables.index).iloc[:,7:]

Unnamed: 0,discourse_text_docs_mean_word_len,discourse_text_docs_nn_count,discourse_text_docs_pr_count,discourse_text_docs_con_count,discourse_text_docs_vb_count,discourse_text_docs_adv_count,discourse_text_docs_adj_count,discourse_text_docs_intj_count,discourse_text_docs_num_count,Claim_x,...,Lead_x,Position_x,Rebuttal_x,Claim_y,Concluding Statement_y,Counterclaim_y,Evidence_y,Lead_y,Position_y,Rebuttal_y
0,3.731343,10,6,8,1,0,2,1,0,0,...,1,0,0,0,0,0,0,1,0,0
1,4.121951,6,7,5,0,0,4,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,4.000000,3,4,3,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,4.027778,9,14,9,2,0,5,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,4.611111,4,2,4,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36760,5.000000,4,3,2,2,0,2,0,1,1,...,0,0,0,1,0,0,0,0,0,0
36761,5.333333,4,0,1,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
36762,4.260870,4,5,3,0,0,3,0,0,0,...,0,1,0,0,0,0,0,0,1,0
36763,4.341463,12,15,20,1,0,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [389]:
X_training_data = sparse.hstack(((X_train2,train_data2.iloc[:,6:].values)))

In [390]:
X_train, X_test, y_train, y_test = train_test_split(X_training_data,y,test_size=0.25,random_state=8)

mn_classifier = MultinomialNB(alpha=14.4)

mn_classifier.fit(X_train,y_train)

ypred = mn_classifier.predict(X_test)

# sklearn.metrics.confusion_matrix(y_test,ypred)

In [391]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6071583986074848

In [392]:
from sklearn.ensemble import RandomForestClassifier

In [393]:
rf_classifier = RandomForestClassifier(n_estimators=300, criterion="entropy", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0, max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0, bootstrap=True, oob_score=False, n_jobs=None, random_state=8, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0, max_samples=None)

In [394]:
rf_classifier.fit(X_train,y_train)

In [395]:
ypred = rf_classifier.predict(X_test)

In [396]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6455613577023499