# Import Libraries

In [338]:
import pandas as pd
import numpy as np
import spacy
import re
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from scipy import sparse
from tqdm import tqdm

In [257]:
nlp = spacy.load('en_core_web_sm')

# Load Source Data

In [339]:
train_data = pd.read_csv("feedback-prize-effectiveness/train.csv")
test_data = pd.read_csv("feedback-prize-effectiveness/test.csv")

In [340]:
train_data = train_data.drop_duplicates(subset="discourse_text")
test_data = test_data.drop_duplicates(subset="discourse_text")

In [260]:
print(train_data.shape, test_data.shape)

(36691, 5) (10, 4)


In [261]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [262]:
test_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [263]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36691 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36691 non-null  object
 1   essay_id                 36691 non-null  object
 2   discourse_text           36691 non-null  object
 3   discourse_type           36691 non-null  object
 4   discourse_effectiveness  36691 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [264]:
train_data.discourse_type.unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [265]:
train_data.discourse_effectiveness.unique()

array(['Adequate', 'Ineffective', 'Effective'], dtype=object)

In [266]:
print("Count of Discourse Effectiveness")

train_data.discourse_effectiveness.value_counts()

Count of Discourse Effectiveness


Adequate       20912
Effective       9323
Ineffective     6456
Name: discourse_effectiveness, dtype: int64

In [267]:
print("Discourse Effectiveness Percent of Total")

train_data.discourse_effectiveness.value_counts() / train_data.shape[0]

Discourse Effectiveness Percent of Total


Adequate       0.569949
Effective      0.254095
Ineffective    0.175956
Name: discourse_effectiveness, dtype: float64

In [268]:
train_data.replace({"Ineffective":0,"Adequate":1,"Effective":2},inplace=True)

In [269]:
train_data.discourse_effectiveness.value_counts()

1    20912
2     9323
0     6456
Name: discourse_effectiveness, dtype: int64

In [270]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1


# Load Spacy Pipeline

In [271]:
nlp = spacy.load("en_core_web_sm")

In [272]:
docs = [*nlp.pipe(train_data.discourse_text)]

# Functions

In [273]:
def pos_count(doc):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    adv_count = 0  #Adverb
    con_count = 0  #Conjuntion
    adj_count = 0  #Adjective
    intj_count = 0 #Interjection
    num_count = 0  #Numerics

    for token in doc:
        if token.pos_ == 'NOUN': #Noun
            nn_count += 1

        if token.pos_ == "PRON": #Pronoun
            pr_count += 1
        
        if token.pos_ == "CONJ": #Conjunction
            con_count += 1

        if token.pos_ == 'VERB': #Verb
            vb_count += 1

        if token.pos_ == 'ADV': #Adverb
            adv_count += 1
        
        if token.pos_ == 'ADJ': #Adjective
            adj_count += 1

        if token.pos_ == 'INTJ': #Interjection
            intj_count += 1

        if token.pos_ == 'NUM': #Numerics
            num_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, adv_count, con_count, adj_count, intj_count, num_count])

In [274]:
def text_features(df, col, col2):
    df[f"{col}_num_words"] = df[col].apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].apply(lambda x: len([w for w in str(x).lower().split() if w in nlp.Defaults.stop_words]))
    df[f"{col}_num_words_upper"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df[f"{col}_mean_word_len"] = df[col].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df[f"{col}_num_paragraphs"] = df[col].apply(lambda x: len(x.split('\n')))
    df[[f'{col}_nn_count',f'{col}_pr_count',f'{col}_con_count',f'{col}_vb_count',f'{col}_adv_count',f'{col}_adj_count',f'{col}_intj_count',f'{col}_num_count']] = df[col2].apply(pos_count)
    return df

# Word Processing

In [275]:
train_data["discourse_text_docs"] = docs

In [276]:
le = LabelEncoder()

In [277]:
discourse_type_labels = le.fit_transform(train_data.discourse_type)

In [278]:
train_data['discourse_type'] = discourse_type_labels

In [279]:
train_data2 = text_features(train_data,"discourse_text","discourse_text_docs").copy()

In [280]:
cv = CountVectorizer()
# X = train_data2.drop(columns={'discourse_effectiveness'}).copy()
y = train_data2.discourse_effectiveness.copy()
X_train2 = cv.fit_transform(train_data2.discourse_text)

In [281]:
# dummy_variables = pd.get_dummies(train_data2.discourse_type)

In [282]:
# td3 = pd.concat([train_data2,dummy_variables], axis=1).fillna(0).iloc[:,6:]

In [310]:
numeric_data1 = train_data2.drop(columns=['discourse_id','essay_id','discourse_text','discourse_text_docs','discourse_effectiveness'])

In [311]:
sc = StandardScaler()

In [312]:
X_training_data = sparse.hstack((X_train2,sc.fit_transform(numeric_data1)))

In [313]:
X_train, X_test, y_train, y_test = train_test_split(X_training_data,y,test_size=0.25,random_state=8)

# Xgboost

In [314]:
xgb_classifier = xgb.XGBClassifier(learning_rate=0.37,n_estimators=195)

In [315]:
xgb_classifier.fit(X_train,y_train)

In [316]:
ypred = xgb_classifier.predict(X_test)

In [317]:
xgb_classifier.predict_proba(X_test)

array([[0.46570024, 0.43545577, 0.09884401],
       [0.24933694, 0.704238  , 0.04642501],
       [0.03881221, 0.37954134, 0.58164644],
       ...,
       [0.4561415 , 0.53508794, 0.00877058],
       [0.1949098 , 0.7918921 , 0.01319811],
       [0.08280249, 0.56150144, 0.35569605]], dtype=float32)

In [318]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6657582034230896

In [319]:
sklearn.metrics.confusion_matrix(y_test,ypred)

array([[ 314, 1281,   55],
       [ 208, 4617,  468],
       [  22, 1032, 1176]])

# Submition Data

In [341]:
nlp = spacy.load("en_core_web_sm")

docs = [*nlp.pipe(test_data.discourse_text)]

In [342]:
test_data["discourse_text_docs"] = docs

In [343]:
discourse_type_labels2 = le.transform(test_data.discourse_type)

In [344]:
test_data['discourse_type'] = discourse_type_labels2

In [345]:
test_data2 = text_features(test_data,"discourse_text","discourse_text_docs").copy()

In [346]:
X_test2 = cv.transform(test_data2.discourse_text)

In [347]:
numeric_data2 = test_data2.drop(columns=['discourse_id','essay_id','discourse_text','discourse_text_docs'])

In [348]:
X_testing_data = sparse.hstack((X_test2,sc.transform(numeric_data2)))

# XGBoost

In [349]:
ypred = xgb_classifier.predict(X_testing_data)

In [350]:
xgb_classifier.predict_proba(X_testing_data)

array([[0.01497783, 0.30297267, 0.6820495 ],
       [0.05102963, 0.6515042 , 0.2974661 ],
       [0.03951603, 0.57058847, 0.3898955 ],
       [0.15320879, 0.61788833, 0.22890289],
       [0.1306921 , 0.37635908, 0.49294883],
       [0.02120641, 0.34554982, 0.63324374],
       [0.00485903, 0.43368855, 0.56145245],
       [0.09355626, 0.5042503 , 0.40219343],
       [0.03141454, 0.12427477, 0.84431064],
       [0.00180178, 0.6580689 , 0.34012938]], dtype=float32)

In [351]:
xgb_classifier.predict(X_testing_data)

array([2, 1, 1, 1, 2, 2, 2, 1, 2, 1])