# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import sklearn
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from scipy import sparse
from tqdm import tqdm

In [2]:
nlp = spacy.load('en_core_web_sm')

# Load Source Data

In [3]:
train_data = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')
test_data = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')
sub = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")

In [4]:
train_data = train_data.drop_duplicates(subset="discourse_text")
test_data = test_data.drop_duplicates(subset="discourse_text")

In [5]:
print(train_data.shape, test_data.shape)

(36691, 5) (10, 4)


In [6]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [7]:
test_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36691 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36691 non-null  object
 1   essay_id                 36691 non-null  object
 2   discourse_text           36691 non-null  object
 3   discourse_type           36691 non-null  object
 4   discourse_effectiveness  36691 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [9]:
train_data.discourse_type.unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [10]:
train_data.discourse_effectiveness.unique()

array(['Adequate', 'Ineffective', 'Effective'], dtype=object)

In [11]:
print("Count of Discourse Effectiveness")

train_data.discourse_effectiveness.value_counts()

Count of Discourse Effectiveness


Adequate       20912
Effective       9323
Ineffective     6456
Name: discourse_effectiveness, dtype: int64

In [12]:
print("Discourse Effectiveness Percent of Total")

train_data.discourse_effectiveness.value_counts() / train_data.shape[0]

Discourse Effectiveness Percent of Total


Adequate       0.569949
Effective      0.254095
Ineffective    0.175956
Name: discourse_effectiveness, dtype: float64

In [13]:
train_data.replace({"Ineffective":0,"Adequate":1,"Effective":2},inplace=True)

In [14]:
train_data.discourse_effectiveness.value_counts()

1    20912
2     9323
0     6456
Name: discourse_effectiveness, dtype: int64

In [15]:
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,1


# Load Spacy Pipeline

In [16]:
nlp = spacy.load("en_core_web_sm")

In [17]:
docs = [*nlp.pipe(train_data.discourse_text)]

# Functions

In [18]:
def pos_count(doc):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    adv_count = 0  #Adverb
    con_count = 0  #Conjuntion
    adj_count = 0  #Adjective
    intj_count = 0 #Interjection
    num_count = 0  #Numerics

    for token in doc:
        if token.pos_ == 'NOUN': #Noun
            nn_count += 1

        if token.pos_ == "PRON": #Pronoun
            pr_count += 1
        
        if token.pos_ == "CONJ": #Conjunction
            con_count += 1

        if token.pos_ == 'VERB': #Verb
            vb_count += 1

        if token.pos_ == 'ADV': #Adverb
            adv_count += 1
        
        if token.pos_ == 'ADJ': #Adjective
            adj_count += 1

        if token.pos_ == 'INTJ': #Interjection
            intj_count += 1

        if token.pos_ == 'NUM': #Numerics
            num_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, adv_count, con_count, adj_count, intj_count, num_count])

In [19]:
def text_features(df, col, col2):
    df[f"{col}_num_words"] = df[col].apply(lambda x: len(str(x).split()))
    df[f"{col}_num_unique_words"] = df[col].apply(lambda x: len(set(str(x).split())))
    df[f"{col}_num_chars"] = df[col].apply(lambda x: len(str(x)))
    df[f"{col}_num_stopwords"] = df[col].apply(lambda x: len([w for w in str(x).lower().split() if w in nlp.Defaults.stop_words]))
    df[f"{col}_num_words_upper"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df[f"{col}_num_words_title"] = df[col].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df[f"{col}_mean_word_len"] = df[col].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df[f"{col}_num_paragraphs"] = df[col].apply(lambda x: len(x.split('\n')))
    df[[f'{col}_nn_count',f'{col}_pr_count',f'{col}_con_count',f'{col}_vb_count',f'{col}_adv_count',f'{col}_adj_count',f'{col}_intj_count',f'{col}_num_count']] = df[col2].apply(pos_count)
    return df

# Word Processing

In [20]:
train_data["discourse_text_docs"] = docs

In [21]:
le = LabelEncoder()

In [22]:
discourse_type_labels = le.fit_transform(train_data.discourse_type)

In [23]:
train_data['discourse_type'] = discourse_type_labels

In [24]:
train_data2 = text_features(train_data,"discourse_text","discourse_text_docs").copy()

In [25]:
train_data2["discourse_effectiveness"].replace("Effective", value = 2, inplace = True)
train_data2["discourse_effectiveness"].replace("Adequate", value = 1, inplace = True)
train_data2["discourse_effectiveness"].replace("Ineffective", value = 0, inplace = True)

In [26]:
cv = CountVectorizer()
y = train_data2.discourse_effectiveness.copy()
X_train2 = cv.fit_transform(train_data2.discourse_text)

In [27]:
numeric_data1 = train_data2.drop(columns=['discourse_id','essay_id','discourse_text','discourse_text_docs','discourse_effectiveness'])

In [28]:
sc = StandardScaler()

In [29]:
X_training_data = sparse.hstack((X_train2,sc.fit_transform(numeric_data1)))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_training_data,y,test_size=0.25,random_state=8)

# Xgboost

In [31]:
xgb_classifier = xgb.XGBClassifier(learning_rate=0.37,n_estimators=195)

In [32]:
xgb_classifier.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.37, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=195,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [33]:
ypred = xgb_classifier.predict(X_test)

In [34]:
xgb_classifier.predict_proba(X_test)

array([[0.8478075 , 0.11659585, 0.03559665],
       [0.20321141, 0.75441337, 0.04237525],
       [0.05199059, 0.51025623, 0.43775323],
       ...,
       [0.30913314, 0.6779621 , 0.01290474],
       [0.11944469, 0.8574001 , 0.02315524],
       [0.08175102, 0.70432967, 0.21391934]], dtype=float32)

In [35]:
sklearn.metrics.accuracy_score(y_test,ypred)

0.6648860787092554

In [36]:
# sklearn.metrics.confusion_matrix(y_test,ypred)

# Submition Data

In [37]:
nlp = spacy.load("en_core_web_sm")

docs = [*nlp.pipe(test_data.discourse_text)]

In [38]:
test_data["discourse_text_docs"] = docs

In [39]:
discourse_type_labels2 = le.transform(test_data.discourse_type)

In [40]:
test_data['discourse_type'] = discourse_type_labels2

In [41]:
test_data2 = text_features(test_data,"discourse_text","discourse_text_docs").copy()

In [42]:
X_test2 = cv.transform(test_data2.discourse_text)

In [43]:
numeric_data2 = test_data2.drop(columns=['discourse_id','essay_id','discourse_text','discourse_text_docs'])

In [44]:
X_testing_data = sparse.hstack((X_test2,sc.transform(numeric_data2)))

# XGBoost

In [45]:
xgb_classifier.fit(X_training_data,y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.37, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=195,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [46]:
ypred = xgb_classifier.predict(X_testing_data)

In [47]:
test_predict = xgb_classifier.predict_proba(X_testing_data)

In [48]:
test_predict

array([[0.00366892, 0.26708975, 0.7292413 ],
       [0.04808914, 0.7676333 , 0.18427749],
       [0.04202449, 0.57078   , 0.3871956 ],
       [0.1253755 , 0.610134  , 0.26449046],
       [0.07194986, 0.54216486, 0.3858853 ],
       [0.01828533, 0.16783999, 0.81387466],
       [0.01587587, 0.5282219 , 0.4559022 ],
       [0.07472366, 0.6226058 , 0.30267054],
       [0.03932483, 0.39316806, 0.5675071 ],
       [0.00397196, 0.27527353, 0.7207545 ]], dtype=float32)

In [49]:
xgb_classifier.predict(X_testing_data)

array([2, 1, 1, 1, 1, 2, 1, 1, 2, 2])

In [50]:
test_data

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_text_docs,discourse_text_num_words,discourse_text_num_unique_words,discourse_text_num_chars,discourse_text_num_stopwords,discourse_text_num_words_upper,...,discourse_text_mean_word_len,discourse_text_num_paragraphs,discourse_text_nn_count,discourse_text_pr_count,discourse_text_con_count,discourse_text_vb_count,discourse_text_adv_count,discourse_text_adj_count,discourse_text_intj_count,discourse_text_num_count
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,4,"(Making, choices, in, life, can, be, very, dif...",50,40,274,29,0,...,4.48,1,11,5,8,6,0,4,0,1
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,5,"(Seeking, multiple, opinions, can, help, a, pe...",11,10,65,4,0,...,4.909091,1,3,0,4,0,0,2,0,0
2,9790d835736b,D72CB1C11673,it can decrease stress levels,0,"(it, can, decrease, stress, levels)",5,5,30,2,0,...,5.0,1,2,1,2,0,0,0,0,0
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,0,"(a, great, chance, to, learn, something, new)",7,7,38,3,0,...,4.428571,1,1,1,1,0,0,2,0,0
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,0,"(can, be, very, helpful, and, beneficial, .)",6,6,36,4,0,...,5.0,1,0,0,1,1,0,2,0,0
5,2e214524dbe3,D72CB1C11673,When making a decision there is a chance that ...,3,"(When, making, a, decision, there, is, a, chan...",74,57,428,44,0,...,4.783784,1,14,5,18,4,0,9,0,0
6,84812fc2ab9f,D72CB1C11673,Everyone is different and may have more experi...,3,"(Everyone, is, different, and, may, have, more...",82,63,475,46,0,...,4.792683,1,19,9,16,3,0,10,0,0
7,c668ff840720,D72CB1C11673,Seeking others opinion can be very helpful and...,0,"(Seeking, others, opinion, can, be, very, help...",9,9,59,5,0,...,5.555556,1,2,0,2,1,0,2,0,0
8,739a6d00f44a,D72CB1C11673,Taking other peoples advice and doing what the...,3,"(Taking, other, peoples, advice, and, doing, w...",88,61,458,56,0,...,4.204545,1,13,14,16,7,0,10,0,0
9,bcfae2c9a244,D72CB1C11673,You can learn from others experiences by seeki...,1,"(You, can, learn, from, others, experiences, b...",57,50,356,27,0,...,5.22807,1,16,6,15,4,0,3,0,0


In [None]:
sub['Ineffective'] = test_predict[:,0]
sub['Adequate'] = test_predict[:,1]
sub['Effective'] = test_predict[:,2]
sub

In [51]:
"""
submit_data_dict = {'discourse_id':test_data.discourse_id
              ,'Ineffective':test_predict[:,0]
              ,'Adequate':test_predict[:,1]
              ,'Effective':test_predict[:,2]}
"""

In [52]:
# sub = np.around(pd.DataFrame(sub),2)

In [53]:
sub.to_csv('submission.csv',index=False)