In [2]:
# !pip uninstall scipy
# !pip install scipy==1.10.1
import gensim
from gensim.utils import simple_preprocess

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from scipy.stats import levene
from scipy.stats import ttest_ind
from scipy.stats import f_oneway, kruskal
from scipy.stats import chi2_contingency
from scipy.stats import skew, kurtosis
from statsmodels.graphics.gofplots import qqplot


import re
import string
import emoji
from bs4 import BeautifulSoup
from textblob import TextBlob

import nltk
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')


from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import Counter
from wordcloud import WordCloud
import distance
from fuzzywuzzy import fuzz


import time
from tqdm.notebook import tqdm
tqdm.pandas()


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE


import xgboost as xgb
import lightgbm as lgbv

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/admin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Loading Dataset**

In [4]:
# Load the two pre-computed feature tables from disk. Judging by the file
# names, the first holds the non-NLP features and the second the basic NLP
# features; the second is the one the cleaned text columns are taken from below.
df_non_nlp = pd.read_csv('./data/non_nlp_features/df_non_nlp_feats1.csv')
df_nlp = pd.read_csv('./data/nlp_features/df_basic_nlp_feats1.csv')

In [5]:
# Keep only the id, the target and the four cleaned text columns; work on an
# independent copy so later edits never touch df_nlp.
nlp_text_columns = [
    'id',
    'project_is_approved',
    'project_title_cln',
    'proj_essay1_cln',
    'proj_essay2_cln',
    'project_resource_summary_cln',
]
df_nlp_text = df_nlp.loc[:, nlp_text_columns].copy()
df_nlp_text.head()

Unnamed: 0,id,project_is_approved,project_title_cln,proj_essay1_cln,proj_essay2_cln,project_resource_summary_cln
0,p253737,0,educ support english learner home,student english learner work english second th...,limit languag limit worldludwig wittgenstein e...,student need opportun practic begin read skill...
1,p258326,1,want projector hungri learner,student arriv school eager learn polit gener s...,projector need school crucial academ improv st...,student need projector help view educ program
2,p182444,0,soccer equip awesom middl school student,true champion arent alway one win gut mia hamm...,student campu come school know face uphil batt...,student need shine guard athlet sock soccer ba...
3,p246581,1,techi kindergarten,work uniqu school fill esl english second lang...,student live high poverti condit limit access ...,student need engag read math way inspir mini ipad
4,p104768,1,interact math tool,second grade classroom next year made around 2...,mani student math subject pertain life subject...,student need hand practic mathemat fun person ...


In [6]:
# Count the missing values in each column of the text frame.
df_nlp_text.isna().sum()

id                               0
project_is_approved              0
project_title_cln               44
proj_essay1_cln                  0
proj_essay2_cln                  0
project_resource_summary_cln     0
dtype: int64

In [7]:
# Replace missing text with the empty string (rebinding the name instead of
# mutating in place), then confirm that no missing values remain.
df_nlp_text = df_nlp_text.fillna('')
df_nlp_text.isna().sum()

id                              0
project_is_approved             0
project_title_cln               0
proj_essay1_cln                 0
proj_essay2_cln                 0
project_resource_summary_cln    0
dtype: int64

# **Generating Word2Vec: Vectors for each word**

In [8]:
# Peek at the first cleaned project title.
df_nlp_text['project_title_cln'].iloc[0]

'educ support english learner home'

In [9]:
# Show how gensim's simple_preprocess splits that same title into tokens.
simple_preprocess(df_nlp_text['project_title_cln'].iloc[0])

['educ', 'support', 'english', 'learner', 'home']

In [10]:
def word2vec_gen_story(df, col):
    """Tokenise every document of ``df[col]`` for Word2Vec training.

    Each value of the column is passed through gensim's ``simple_preprocess``
    and the resulting token lists are collected in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text column.
    col : str
        Name of the column to tokenise.

    Returns
    -------
    list
        One entry per row: whatever ``simple_preprocess`` returns for that row.
    """
    return [simple_preprocess(text) for text in df[col]]

def word2vec_train(model, story):
    """Build the vocabulary of ``model`` from ``story``, train it and report the time taken.

    Parameters
    ----------
    model : object
        A model exposing ``build_vocab``, ``train``, ``corpus_count`` and
        ``epochs`` (here: an untrained ``gensim.models.Word2Vec``).
    story : list
        Tokenised corpus handed to both ``build_vocab`` and ``train``.

    Returns
    -------
    object
        The same ``model`` instance, after training.

    Notes
    -----
    Prints the elapsed wall-clock seconds (vocabulary build + training).
    """
    started_at = time.time()

    model.build_vocab(story)
    # corpus_count and epochs are read from the model itself after build_vocab.
    model.train(
        story,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    elapsed_seconds = time.time() - started_at
    print(elapsed_seconds)
    return model

In [11]:
# Tokenised corpora, one per cleaned text column.
story_proj_title = word2vec_gen_story(df_nlp_text, 'project_title_cln')
story_proj_ess1 = word2vec_gen_story(df_nlp_text, 'proj_essay1_cln')
story_proj_ess2 = word2vec_gen_story(df_nlp_text, 'proj_essay2_cln')
story_proj_res = word2vec_gen_story(df_nlp_text, 'project_resource_summary_cln')


# Settings shared by all four models; only the context window differs
# (5 for the title model, 10 for the other three).
w2v_shared_kwargs = dict(min_count=2, workers=8, vector_size=100)


# Each call prints its own training time, in the order title, essay 1,
# essay 2, resource summary.
word2vec_model1 = word2vec_train(
    gensim.models.Word2Vec(window=5, **w2v_shared_kwargs), story_proj_title
)
word2vec_model2 = word2vec_train(
    gensim.models.Word2Vec(window=10, **w2v_shared_kwargs), story_proj_ess1
)
word2vec_model3 = word2vec_train(
    gensim.models.Word2Vec(window=10, **w2v_shared_kwargs), story_proj_ess2
)
word2vec_model4 = word2vec_train(
    gensim.models.Word2Vec(window=10, **w2v_shared_kwargs), story_proj_res
)

0.587568998336792
6.398947238922119
8.308640003204346
1.4444620609283447


In [12]:
# Report how many words each trained model kept in its vocabulary.
vocab_reports = (
    ('Vocabulary length of proj_title:', word2vec_model1),
    ('Vocabulary length of proj_ess1:', word2vec_model2),
    ('Vocabulary length of proj_ess2:', word2vec_model3),
    ('Vocabulary length of proj_res_sum:', word2vec_model4),
)
for report_label, w2v_model in vocab_reports:
    print(report_label, len(w2v_model.wv.index_to_key))

Vocabulary length of proj_title: 6618
Vocabulary length of proj_ess1: 17003
Vocabulary length of proj_ess2: 20761
Vocabulary length of proj_res_sum: 9380


In [13]:
# Shape of the vector learned for a single word ('read') by the title model;
# its length matches the vector_size the model was created with.
word2vec_model1.wv['read'].shape

(100,)

In [14]:
# Tabulate the normalised word vectors of one model's whole vocabulary:
# one row per word, one column per embedding dimension.
# Point `view_model` at another model to inspect a different text feature.
view_model = word2vec_model1
df_vocab_view_model = pd.DataFrame(
    view_model.wv.get_normed_vectors(),
    index=view_model.wv.index_to_key,
)
print('Shape:', df_vocab_view_model.shape)
df_vocab_view_model

Shape: (6618, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
learn,0.045290,0.042670,0.166546,-0.000281,0.022548,-0.184124,0.019320,0.198408,-0.166268,-0.036648,...,0.082366,-0.062818,0.009860,0.055437,0.181060,0.169323,-0.120646,-0.034877,-0.097515,-0.014989
read,-0.041323,-0.063651,-0.003682,0.156491,0.086488,-0.139032,0.052133,0.123246,-0.065392,-0.113360,...,0.104329,-0.005957,-0.078074,0.057200,0.046128,0.040911,0.142084,-0.002652,-0.204565,0.037798
need,-0.144410,0.188490,-0.065017,0.042579,-0.093902,-0.122287,-0.070589,0.051658,-0.097895,-0.220864,...,-0.034698,0.156480,0.124861,-0.009381,0.118072,0.181038,0.032861,-0.064345,0.061937,-0.075793
student,-0.147908,0.187027,0.014285,-0.033588,0.117591,-0.039632,-0.099160,0.126854,-0.098300,-0.160967,...,-0.128923,-0.035847,0.125920,-0.048118,0.164710,-0.003019,0.187531,-0.026646,0.077263,-0.067336
technolog,0.005644,0.206096,0.023511,-0.121799,0.122854,-0.200058,0.090525,0.078380,0.006332,-0.130649,...,0.103929,-0.132253,0.074525,0.103694,0.091108,0.082438,0.032840,0.093863,-0.134933,0.008771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
holey,-0.090915,0.181997,0.023367,0.108680,0.010764,-0.188472,0.141062,0.257167,-0.097581,-0.059168,...,0.126518,0.050244,-0.024046,-0.000385,0.248989,0.078041,0.134878,-0.020182,0.069345,-0.015245
walkamoli,-0.062397,0.203894,0.008008,0.140174,-0.011977,-0.199298,0.093077,0.229559,-0.038019,-0.088947,...,0.113836,-0.006813,0.053529,0.031089,0.223817,0.168760,0.142827,-0.025339,0.036352,-0.101561
muse,-0.101493,0.082529,-0.020541,0.130691,0.017734,-0.167331,0.120347,0.212954,-0.058600,-0.040397,...,0.181678,0.016709,-0.004712,0.062024,0.203908,0.089305,0.136044,-0.181744,0.099551,-0.067937
geoboard,0.021209,0.066078,-0.016095,-0.065352,-0.004728,-0.242117,0.073599,0.293602,-0.024526,-0.032971,...,0.092540,-0.030949,0.007858,0.011019,0.269781,0.136819,0.166038,-0.188283,0.118046,-0.049586


In [15]:
# Words the title model ranks as most similar to 'read', with their similarity scores.
word2vec_model1.wv.most_similar('read')

[('fluenci', 0.7221347689628601),
 ('comprehens', 0.7146868109703064),
 ('nonfict', 0.7111363410949707),
 ('aloud', 0.6790169477462769),
 ('foster', 0.6778611540794373),
 ('instil', 0.6752875447273254),
 ('rekindl', 0.6741672158241272),
 ('selfselect', 0.6712664365768433),
 ('nook', 0.6699384450912476),
 ('cd', 0.6659601926803589)]

# **Generating Average Word2Vec: Vectors for each document**

In [16]:
def word2vec_gen_docvec(text, model):
    """Return the average Word2Vec vector of one document.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that are
    missing from the model's vocabulary are ignored and the vectors of the
    remaining tokens are averaged element-wise.

    Parameters
    ----------
    text : str or None
        Document text. ``None`` / NaN is treated as an empty document.
    model : gensim.models.Word2Vec
        Trained model whose ``wv`` keyed vectors supply the word vectors.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``. It is all zeros when the
        document is missing or contains no in-vocabulary token.
    """
    # Read the embedding width directly instead of indexing the first
    # vocabulary word, which raises IndexError on an empty vocabulary.
    zero_vec = np.zeros(model.wv.vector_size)

    # Missing text cannot be tokenised; treat it like a document with no words.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return zero_vec

    # Words rarer than the model's min_count never received a vector, so only
    # in-vocabulary tokens can take part in the average.
    known_words = [wrd for wrd in simple_preprocess(text) if wrd in model.wv.key_to_index]
    if not known_words:
        return zero_vec

    return np.mean(model.wv[known_words], axis=0)


# Sanity check: print one sample essay, a separator line, and then display the
# averaged document vector built for it from the essay-1 model.
test_doc = df_nlp_text['proj_essay1_cln'].iloc[5]
print(test_doc)
print('-'*80)
word2vec_gen_docvec(test_doc, word2vec_model2)

move 2nd grade 3rd grade begin next school year take current student move teach inclus classroom includ student adhd sld well autist student student work hard achiev goal matter struggl may school teach hous great deal autist student well ell student student love read work challeng also love move around work better abl move room differ area rather usual set
--------------------------------------------------------------------------------


array([ 0.11494264,  0.3799368 , -0.00770212, -0.2100841 , -0.6650044 ,
       -0.48449647, -0.34660646,  0.49904191,  0.17497565,  0.09198917,
       -0.52872837,  1.0793037 , -0.36428213, -1.1069055 , -0.1424415 ,
        1.0061094 , -0.47118583,  0.9869686 , -0.23240829, -0.53228736,
       -0.23206952, -0.17435393, -0.00267876,  0.10386855,  0.24099359,
        0.26176694, -0.32554257, -0.23715177,  0.8111061 ,  0.25708747,
       -0.42449963,  0.70317143, -0.09232473,  0.70687   ,  0.41302806,
       -1.0874805 , -0.24322167,  0.24349521, -0.30441013,  0.13283093,
        0.38327554,  0.09884408,  0.22680204,  0.37058067,  0.62848693,
       -0.79873216,  0.5925496 ,  0.2071998 , -0.41693527, -0.24388504,
       -1.2168086 ,  0.74037486,  0.07998767, -0.03032509, -0.34628862,
        0.49609444, -0.41647214,  0.04860004,  0.09098256,  0.19382945,
       -0.12265771, -0.10411967, -0.03322358,  0.72808766,  0.7307038 ,
       -0.38975522,  0.3463566 ,  0.26389763,  0.6933319 , -0.28

In [17]:
def word2vec_gen_docvec_arr(df, col, model):
    """Compute the averaged document vector for every row of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the text column.
    col : str
        Name of the text column to vectorise.
    model : gensim.models.Word2Vec
        Trained model passed through to ``word2vec_gen_docvec``.

    Returns
    -------
    numpy.ndarray
        The per-document vectors stacked with ``np.array``, in row order.

    Notes
    -----
    Prints the elapsed wall-clock seconds spent building the vectors.
    """
    started_at = time.time()
    doc_vectors = [word2vec_gen_docvec(text, model) for text in df[col].values]
    print(time.time() - started_at)

    return np.array(doc_vectors)

# Average document vectors for each text column, using the model trained on
# that same column. Every call prints its own run time.
docvec_arr_proj_title = word2vec_gen_docvec_arr(
    df_nlp_text, 'project_title_cln', word2vec_model1
)
docvec_arr_proj_ess1 = word2vec_gen_docvec_arr(
    df_nlp_text, 'proj_essay1_cln', word2vec_model2
)
docvec_arr_proj_ess2 = word2vec_gen_docvec_arr(
    df_nlp_text, 'proj_essay2_cln', word2vec_model3
)
docvec_arr_proj_res = word2vec_gen_docvec_arr(
    df_nlp_text, 'project_resource_summary_cln', word2vec_model4
)

# Show the (n_documents, vector_size) shape of each array.
for docvec_arr in (
    docvec_arr_proj_title,
    docvec_arr_proj_ess1,
    docvec_arr_proj_ess2,
    docvec_arr_proj_res,
):
    print(docvec_arr.shape)

1.5562009811401367
9.7330482006073
11.846950054168701
2.773733139038086
(109245, 100)
(109245, 100)
(109245, 100)
(109245, 100)


In [18]:
def word2vec_gen_docvec_df(arr, prefix, flag=0):
    """Wrap an array of document vectors in a DataFrame with labelled columns.

    Parameters
    ----------
    arr : array-like
        2-D collection of document vectors, one row per document.
    prefix : str
        Tag identifying the text feature; vector columns are named
        ``w2v_<prefix>_<i>``.
    flag : int, default 0
        When equal to 1, two extra columns ``<prefix>_skew`` and
        ``<prefix>_kurt`` hold the row-wise skewness and kurtosis of ``arr``.

    Returns
    -------
    pandas.DataFrame
        The vector columns, plus the two statistics columns when requested.
    """
    docvec_df = pd.DataFrame(arr)
    docvec_df.columns = docvec_df.columns.map(lambda col: f'w2v_{prefix}_{col}')

    if flag != 1:
        return docvec_df

    # Statistics are taken across the embedding dimensions of each document.
    docvec_df[f'{prefix}_skew'] = skew(arr, axis=1)
    docvec_df[f'{prefix}_kurt'] = kurtosis(arr, axis=1)
    return docvec_df


# Plain document-vector frames (flag left at its default of 0).
df_arr_proj_title = word2vec_gen_docvec_df(docvec_arr_proj_title, 'title')
df_arr_proj_ess1 = word2vec_gen_docvec_df(docvec_arr_proj_ess1, 'ess1')
df_arr_proj_ess2 = word2vec_gen_docvec_df(docvec_arr_proj_ess2, 'ess2')
df_arr_proj_res = word2vec_gen_docvec_df(docvec_arr_proj_res, 'res_sum')


# Same frames with the per-document skew and kurtosis columns appended.
df_arr_proj_title_sk = word2vec_gen_docvec_df(docvec_arr_proj_title, 'title', flag=1)
df_arr_proj_ess1_sk = word2vec_gen_docvec_df(docvec_arr_proj_ess1, 'ess1', flag=1)
df_arr_proj_ess2_sk = word2vec_gen_docvec_df(docvec_arr_proj_ess2, 'ess2', flag=1)
df_arr_proj_res_sk = word2vec_gen_docvec_df(docvec_arr_proj_res, 'res_sum', flag=1)

# **Stacking the document vectors together**

In [19]:
# print(np.hstack((docvec_arr_proj_title, docvec_arr_proj_ess1, docvec_arr_proj_ess2, docvec_arr_proj_res)).shape)
# pd.DataFrame(np.hstack((docvec_arr_proj_title, docvec_arr_proj_ess1, docvec_arr_proj_ess2, docvec_arr_proj_res)))

In [20]:
def merge_df_arr(df1, df2, df3, df4):
    """Merge four frames side by side on their row index.

    The frames are merged left to right with ``pd.merge`` using the index of
    both sides as the key, so only index labels present in every frame survive.

    Parameters
    ----------
    df1, df2, df3, df4 : pandas.DataFrame
        Frames to combine, in the column order wanted in the result.

    Returns
    -------
    pandas.DataFrame
        The columns of all four frames, aligned on the index.
    """
    df_merged = df1
    for df_next in (df2, df3, df4):
        df_merged = pd.merge(df_merged, df_next, left_index=True, right_index=True)
    return df_merged

# Combine the four per-column frames into one wide frame, once without and
# once with the skew/kurtosis columns.
df_arr_merged_text_feat = merge_df_arr(
    df_arr_proj_title, df_arr_proj_ess1, df_arr_proj_ess2, df_arr_proj_res
)
df_arr_merged_text_feat_sk = merge_df_arr(
    df_arr_proj_title_sk, df_arr_proj_ess1_sk, df_arr_proj_ess2_sk, df_arr_proj_res_sk
)

for merged_text_frame in (df_arr_merged_text_feat, df_arr_merged_text_feat_sk):
    print(merged_text_frame.shape)

(109245, 400)
(109245, 408)


In [21]:
# List the columns of df_arr_merged_text_feat that contain missing values,
# with their counts (empty when there are none).
text_feat_null_counts = df_arr_merged_text_feat.isna().sum()
text_feat_null_counts.loc[text_feat_null_counts > 0]

Series([], dtype: int64)

In [22]:
# Report the columns of df_arr_merged_text_feat_sk that contain missing
# values, replace those values with 0, then report again to confirm.
sk_null_counts_before = df_arr_merged_text_feat_sk.isna().sum()
print(sk_null_counts_before.loc[sk_null_counts_before > 0])
print('-'*50)
df_arr_merged_text_feat_sk = df_arr_merged_text_feat_sk.fillna(0)
sk_null_counts_after = df_arr_merged_text_feat_sk.isna().sum()
print(sk_null_counts_after.loc[sk_null_counts_after > 0])

title_skew    166
title_kurt    166
dtype: int64
--------------------------------------------------
Series([], dtype: int64)


# **Compiling and Saving Datasets**

In [23]:
# Attach the project id to the document-vector features by matching row index.
df_nlp_wordvec = df_arr_merged_text_feat.merge(
    df_nlp_text['id'],
    left_index=True,
    right_index=True,
)
print(df_nlp_wordvec.shape)
print('-'*50)
df_nlp_wordvec.head(3)

(109245, 401)
--------------------------------------------------


Unnamed: 0,w2v_title_0,w2v_title_1,w2v_title_2,w2v_title_3,w2v_title_4,w2v_title_5,w2v_title_6,w2v_title_7,w2v_title_8,w2v_title_9,...,w2v_res_sum_91,w2v_res_sum_92,w2v_res_sum_93,w2v_res_sum_94,w2v_res_sum_95,w2v_res_sum_96,w2v_res_sum_97,w2v_res_sum_98,w2v_res_sum_99,id
0,-0.095794,0.821023,0.165957,-0.2324,-0.160335,-0.846026,-0.254201,0.670786,-0.614181,-0.65563,...,-0.053628,0.299027,0.331118,-0.570799,0.159491,0.692381,0.20268,0.707922,1.350626,p253737
1,-0.052652,0.569649,-0.085386,0.076098,-0.024387,-0.669934,-0.059968,0.702918,-0.503515,-0.439409,...,-0.057715,0.101178,0.147903,-0.008279,0.291279,0.321519,-0.248444,0.665515,1.03995,p258326
2,-0.327893,0.970274,-0.169125,0.468727,-0.33318,-0.553257,0.178566,0.780312,-0.051462,-0.172929,...,0.651809,0.007812,0.808386,1.211543,0.647845,-0.257385,-0.555258,0.343186,0.540692,p182444


In [24]:
# Attach the project id to the skew/kurtosis variant of the features by
# matching row index.
df_nlp_wordvec_skew_kurt = df_arr_merged_text_feat_sk.merge(
    df_nlp_text['id'],
    left_index=True,
    right_index=True,
)
print(df_nlp_wordvec_skew_kurt.shape)
print('-'*50)
df_nlp_wordvec_skew_kurt.head(3)

(109245, 409)
--------------------------------------------------


Unnamed: 0,w2v_title_0,w2v_title_1,w2v_title_2,w2v_title_3,w2v_title_4,w2v_title_5,w2v_title_6,w2v_title_7,w2v_title_8,w2v_title_9,...,w2v_res_sum_93,w2v_res_sum_94,w2v_res_sum_95,w2v_res_sum_96,w2v_res_sum_97,w2v_res_sum_98,w2v_res_sum_99,res_sum_skew,res_sum_kurt,id
0,-0.095794,0.821023,0.165957,-0.2324,-0.160335,-0.846026,-0.254201,0.670786,-0.614181,-0.65563,...,0.331118,-0.570799,0.159491,0.692381,0.20268,0.707922,1.350626,0.101466,-0.567463,p253737
1,-0.052652,0.569649,-0.085386,0.076098,-0.024387,-0.669934,-0.059968,0.702918,-0.503515,-0.439409,...,0.147903,-0.008279,0.291279,0.321519,-0.248444,0.665515,1.03995,0.16182,0.68504,p258326
2,-0.327893,0.970274,-0.169125,0.468727,-0.33318,-0.553257,0.178566,0.780312,-0.051462,-0.172929,...,0.808386,1.211543,0.647845,-0.257385,-0.555258,0.343186,0.540692,-0.128192,0.055842,p182444


In [25]:
# df_nlp_wordvec.to_csv(path_or_buf='./data/nlp_features/df_nlp_wordvec.csv', sep=',', index=False)
# df_nlp_wordvec_skew_kurt.to_csv(path_or_buf='./data/nlp_features/df_nlp_wordvec_skew_kurt.csv', sep=',', index=False)