In [36]:
import pandas as pd
import numpy as np
import sys
import os
import gensim
import spacy
import altair as alt
from hyphen import Hyphenator
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS 
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from sklearn.metrics import accuracy_score
import textstat
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
nltk.download('averaged_perceptron_tagger')
cwd = os.getcwd()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\afhar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
#import  data
data_dir = cwd + '\\data\\'
df_train_raw = pd.read_csv(data_dir+'WikiLarge_Train.csv')
df_test_raw = pd.read_csv(data_dir+'WikiLarge_Test.csv')
df_train_raw.sample(10)

Unnamed: 0,original_text,label
230059,"Dixon sang bass for The Jubilee Singers , a lo...",0
190204,The majority view today is that Mark is the fi...,1
167093,Early life Barry George was born in Hammersmit...,1
397703,The major interests of the AIEE were wire comm...,0
222293,The book was the first systematic discussion o...,0
7637,New York City Police Museum site Accessed Janu...,1
230498,"For the Egyptian Book of the Dead , click here .",0
137610,A graveyard with the statue of the Commendatore .,1
391519,It was made by the American game maker Bethesd...,0
213403,"In January and February of 1418 , he was paid ...",0


In [3]:
#Data processing
#df_train_raw['tokens'] = df_train_raw['original_text'].apply(lambda x: text_process(x))

In [4]:
def clean_tokenize_text(df):
    """
    tokenize the text from the articles
    """
    df['original_text'] = df['original_text'].astype(str)
    # YOUR CODE HERE
    custom_filter = [strip_tags,strip_multiple_whitespaces,
                    strip_punctuation]
    df['tokens'] = df['original_text'].apply(lambda x:preprocess_string(x, custom_filter))
    return df
def pos_tag(tok_list):
    return nltk.pos_tag(tok_list)
def counters(row):
    """
    This includes all the counting operations on the tokens
    """
    #part of speech counting
    pos_l = row['pos']
    nouns = 0
    verbs = 0
    Pnouns = 0
    adjectives = 0
    adverbs = 0
    for pair in pos_l:
        if pair[1][0] == 'N':
            nouns+=1
        if pair[1][0] == 'V':
            verbs+=1
        if pair[1] == 'NNP':
            Pnouns+=1
        if pair[1] == 'NNPS':
            Pnouns+=1
        if pair[1][0] == 'J':
            adjectives+=1
        if pair[1][0] == 'R':
            adverbs+=1 
    row['nouns'] = nouns 
    row['verbs'] = verbs 
    row['Pnouns'] = Pnouns 
    row['adjectives'] = adjectives 
    row['adverbs'] = adverbs 
    
    #number of words
    row['num_words'] = len(row['tokens'])
    
    #number of sylables
    syls = []
    h = Hyphenator('en_US')
    tok_l = row['tokens']
    for t in tok_l:
        syls.append(len(h.syllables(t))+1)
    syls = np.array(syls)
    row['syls'] = syls
    try:
        row['max_syls'] = np.max(syls)
        row['avg_syls'] = np.mean(syls)
        row['std_syls'] = np.std(syls)
    except:
        row['max_syls'] = np.nan
        row['avg_syls'] = np.nan
        row['std_syls'] = np.nan
    return row
def readability_scores(row):
    #https://pypi.org/project/textstat/
    text = row['original_text']
    row['flesch_score'] = textstat.flesch_reading_ease(text)
    row['flesch_grade_lvl'] = textstat.flesch_kincaid_grade(text)
    row['fog_grade_lvl'] = textstat.gunning_fog(text)
    row['ARI_grade'] = textstat.automated_readability_index(text)
    row['CLI_grade'] = textstat.coleman_liau_index(text)
    row['LWF_grade'] = textstat.linsear_write_formula(text)
    row['Dale-Chall_score'] = textstat.dale_chall_readability_score(text)
    row['combined_grade'] = textstat.text_standard(text, float_output=False)
    return row

In [5]:
#The "do stuff block"
#training data
df_train_raw = df_train_raw #for teppsting, lets work with a small set
df_tokenized = clean_tokenize_text(df_train_raw)
df_tokenized['pos'] = df_tokenized['tokens'].apply(lambda x: pos_tag(x))
df_tokenized = df_tokenized.apply(lambda row:counters(row), axis =1)
df_tokenized = df_tokenized.apply(lambda row:readability_scores(row), axis =1)



In [6]:
#df_tokenized[df_tokenized['label']==0]
df_train_raw = df_train_raw.sample(1000,random_state = 10)
df_train_raw ['label'].value_counts()

0    503
1    497
Name: label, dtype: int64

In [7]:
#df_tokenized.sample(10)

In [8]:
prep2 = pd.read_pickle('brosko_train_f.pkl')
merged = df_tokenized.merge(prep2, left_index=True, right_index=True, how='inner')
#merged.sample(10)

In [9]:
merged.columns

Index(['original_text_x', 'label_x', 'tokens', 'pos', 'nouns', 'verbs',
       'Pnouns', 'adjectives', 'adverbs', 'num_words', 'syls', 'max_syls',
       'avg_syls', 'std_syls', 'flesch_score', 'flesch_grade_lvl',
       'fog_grade_lvl', 'ARI_grade', 'CLI_grade', 'LWF_grade',
       'Dale-Chall_score', 'combined_grade', 'index', 'original_text_y',
       'label_y', 'clean_text', 'word_count', 'clean_text_no_stop',
       'word_count_no_stop', 'Nsyll', 'AoA_Kup_lem', 'Perc_known_lem'],
      dtype='object')

In [42]:
X_train = merged[['nouns', 'verbs',
       'Pnouns', 'adjectives', 'adverbs', 'num_words', 'max_syls',
       'avg_syls', 'std_syls', 'flesch_score', 'flesch_grade_lvl',
       'fog_grade_lvl', 'ARI_grade', 'CLI_grade', 'LWF_grade',
       'Dale-Chall_score','word_count_no_stop','Nsyll','AoA_Kup_lem','Perc_known_lem']]
#X_train = merged[['flesch_score', 'flesch_grade_lvl',
#       'fog_grade_lvl', 'ARI_grade', 'CLI_grade', 'LWF_grade',
#       'Dale-Chall_score']]
X_train = X_train.replace(np.nan, 0)
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
y_train = merged['label_x'].values
#y_train =y_train.reshape(len(y_train),1)

In [27]:
X= X_train
y = y_train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [43]:
knn = KNeighborsClassifier(n_neighbors = 700) #start with square root of N, to add a loop later
knn.fit(X_train, y_train)
y_pred = knn.predict(X_train)
accuracy_score(y_train, y_pred)

0.672707597512285

In [13]:


rf = RandomForestClassifier(n_estimators=100, max_depth = 16, max_features = None)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6829656666715139

In [14]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6552416129829715

In [15]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)


0.6485305451742842

In [None]:
df_tokenized.to_pickle('df_tokenized.pkl')

In [18]:
#The "do stuff block"
#test data
df_test_raw = df_test_raw #for teppsting, lets work with a small set
df_tokenized_test = clean_tokenize_text(df_test_raw)
df_tokenized_test['pos'] = df_tokenized_test['tokens'].apply(lambda x: pos_tag(x))
df_tokenized_test = df_tokenized_test.apply(lambda row:counters(row), axis =1)
df_tokenized_test = df_tokenized_test.apply(lambda row:readability_scores(row), axis =1)
prep3 = pd.read_pickle('brosko_test_f.pkl')
merged_test = df_tokenized_test.merge(prep3, left_index=True, right_index=True, how='inner')
X_test = merged_test[['flesch_score', 'flesch_grade_lvl',
       'fog_grade_lvl', 'ARI_grade', 'CLI_grade', 'LWF_grade',
       'Dale-Chall_score']]
y_pred = knn.predict(X_test)

In [30]:
merged_test.to_pickle('test_data.pkl')

In [44]:
X_test = merged_test[['nouns', 'verbs',
       'Pnouns', 'adjectives', 'adverbs', 'num_words', 'max_syls',
       'avg_syls', 'std_syls', 'flesch_score', 'flesch_grade_lvl',
       'fog_grade_lvl', 'ARI_grade', 'CLI_grade', 'LWF_grade',
       'Dale-Chall_score','word_count_no_stop','Nsyll','AoA_Kup_lem','Perc_known_lem']]
X_test = X_test.replace(np.nan, 0)
X_test = scaler.transform(X_test)
y_pred = knn.predict(X_test)
df = pd.DataFrame()
df['id'] =merged_test.index
df['label'] = y_pred
df.to_csv('knn_textscores_only.csv',index = False)

array([[-1.53804494, -1.3207071 , -1.01237818, ..., -3.20056336,
        -3.82082126, -6.33106802],
       [-1.53804494, -1.3207071 , -1.01237818, ..., -3.20056336,
        -3.82082126, -6.33106802],
       [-1.53804494, -1.3207071 , -1.01237818, ..., -3.20056336,
        -3.82082126, -6.33106802],
       ...,
       [-1.34765019, -1.3207071 , -1.01237818, ..., -1.57604297,
        -1.82603962,  0.21604896],
       [-1.34765019, -1.3207071 , -1.01237818, ..., -1.57604297,
        -1.82603962,  0.21604896],
       [-1.34765019, -1.3207071 , -1.01237818, ..., -1.57604297,
        -1.82603962,  0.21604896]])