In [143]:
import pandas as pd
import numpy as np
import sys
import os
import gensim
import spacy
import altair as alt
from hyphen import Hyphenator
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS 
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('averaged_perceptron_tagger')
cwd = os.getcwd()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\afhar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [137]:
!pip install textstat 
#not the right way to do this, but doing for testing
import textstat



In [150]:
#import data
data_dir = cwd + '\\data\\'
df_train_raw = pd.read_csv(data_dir+'WikiLarge_Train.csv')
df_train_raw.sample(10)

Unnamed: 0,original_text,label
138349,It is the capital of the canton of Vaud and of the district of Lausanne .,1
288578,"The two-dimensional analogue figure is a polygon , and the three-dimensional one is a polyhedron .",0
220934,Television shows,0
99083,"Terriers locate the den or set of the target animal and then bolt , capture , or kill the animal .",1
319825,It is made from chromium ore and is part of the process for making chromium metal .,0
363599,The books were first published in 1988 .,0
65139,Ervillers is a commune in the Pas-de-Calais department in the Nord-Pas-de-Calais region of France .,1
364893,"41,336 houses were badly damaged",0
298867,It is one of the 83 first French departments made during the French revolution .,0
37673,"Minagawa lived alone in the Momochi apartment building in Nishijin Sawara Ward , Fukuoka , near Seinan Gakuin University , until 2005 , when she moved to Keijuen , a special nursing home in her native Akaike .",1


In [84]:
#Data processing
#df_train_raw['tokens'] = df_train_raw['original_text'].apply(lambda x: text_process(x))

In [140]:
def clean_tokenize_text(df):
    """
    tokenize the text from the articles
    """
    df['original_text'] = df['original_text'].astype(str)
    # YOUR CODE HERE
    custom_filter = [strip_tags,strip_multiple_whitespaces,
                    strip_punctuation]
    df['tokens'] = df['original_text'].apply(lambda x:preprocess_string(x, custom_filter))
    return df
def pos_tag(tok_list):
    return nltk.pos_tag(tok_list)
def counters(row):
    """
    This includes all the counting operations on the tokens
    """
    #part of speech counting
    pos_l = row['pos']
    nouns = 0
    verbs = 0
    Pnouns = 0
    adjectives = 0
    adverbs = 0
    for pair in pos_l:
        if pair[1][0] == 'N':
            nouns+=1
        if pair[1][0] == 'V':
            verbs+=1
        if pair[1] == 'NNP':
            Pnouns+=1
        if pair[1] == 'NNPS':
            Pnouns+=1
        if pair[1][0] == 'J':
            adjectives+=1
        if pair[1][0] == 'R':
            adverbs+=1 
    row['nouns'] = nouns 
    row['verbs'] = verbs 
    row['Pnouns'] = Pnouns 
    row['adjectives'] = adjectives 
    row['adverbs'] = adverbs 
    
    #number of words
    row['num_words'] = len(row['tokens'])
    
    #number of sylables
    syls = []
    h = Hyphenator('en_US')
    tok_l = row['tokens']
    for t in tok_l:
        syls.append(len(h.syllables(t))+1)
    syls = np.array(syls)
    row['syls'] = syls
    try:
        row['max_syls'] = np.max(syls)
        row['avg_syls'] = np.mean(syls)
        row['std_syls'] = np.std(syls)
    except:
        row['max_syls'] = np.nan
        row['avg_syls'] = np.nan
        row['std_syls'] = np.nan
    return row
def readability_scores(row):
    #https://pypi.org/project/textstat/
    text = row['original_text']
    row['flesch_score'] = textstat.flesch_reading_ease(text)
    row['flesch_grade_lvl'] = textstat.flesch_kincaid_grade(text)
    row['fog_grade_lvl'] = textstat.gunning_fog(text)
    row['ARI_grade'] = textstat.automated_readability_index(text)
    row['CLI_grade'] = textstat.coleman_liau_index(text)
    row['LWF_grade'] = textstat.linsear_write_formula(text)
    row['Dale-Chall_score'] = textstat.dale_chall_readability_score(text)
    row['combined_grade'] = textstat.text_standard(text, float_output=False)
    return row

In [159]:
#The "do stuff block"
df_train_raw = df_train_raw.sample(1000,random_state = 10) #for testing, lets work with a small set
df_tokenized = clean_tokenize_text(df_train_raw)
df_tokenized['pos'] = df_tokenized['tokens'].apply(lambda x: pos_tag(x))
df_tokenized = df_tokenized.apply(lambda row:counters(row), axis =1)
df_tokenized = df_tokenized.apply(lambda row:readability_scores(row), axis =1)

In [158]:
#df_tokenized[df_tokenized['label']==0]
df_train_raw = df_train_raw.sample(1000,random_state = 10)
df_train_raw ['label'].value_counts()

0    502
1    498
Name: label, dtype: int64

In [161]:
df_tokenized.sample(10)

Unnamed: 0,original_text,label,tokens,pos,nouns,verbs,Pnouns,adjectives,adverbs,num_words,...,avg_syls,std_syls,flesch_score,flesch_grade_lvl,fog_grade_lvl,ARI_grade,CLI_grade,LWF_grade,Dale-Chall_score,combined_grade
390957,"Norfolk Island -LRB- Norfuk : Norfuk Ailen -RRB- is a territory of Australia . It is located in the South Pacific Ocean between Australia , New Zealand , and New Caledonia .",0,"[Norfolk, Island, LRB, Norfuk, Norfuk, Ailen, RRB, is, a, territory, of, Australia, It, is, located, in, the, South, Pacific, Ocean, between, Australia, New, Zealand, and, New, Caledonia]","[(Norfolk, NNP), (Island, NNP), (LRB, NNP), (Norfuk, NNP), (Norfuk, NNP), (Ailen, NNP), (RRB, NNP), (is, VBZ), (a, DT), (territory, NN), (of, IN), (Australia, NNP), (It, PRP), (is, VBZ), (located, VBN), (in, IN), (the, DT), (South, NNP), (Pacific, NNP), (Ocean, NNP), (between, IN), (Australia, NNP), (New, NNP), (Zealand, NNP), (and, CC), (New, NNP), (Caledonia, NNP)]",17,3,16,0,0,27,...,2.037037,1.035713,40.85,10.9,11.33,10.1,10.72,9.0,8.98,10th and 11th grade
256866,Two mirror images of a molecule that can not be superimposed onto each other are referred to as enantiomers or optical isomers .,0,"[Two, mirror, images, of, a, molecule, that, can, not, be, superimposed, onto, each, other, are, referred, to, as, enantiomers, or, optical, isomers]","[(Two, CD), (mirror, NN), (images, NNS), (of, IN), (a, DT), (molecule, NN), (that, WDT), (can, MD), (not, RB), (be, VB), (superimposed, VBN), (onto, IN), (each, DT), (other, JJ), (are, VBP), (referred, VBN), (to, TO), (as, IN), (enantiomers, NNS), (or, CC), (optical, JJ), (isomers, NNS)]",5,4,0,2,1,22,...,2.090909,1.202614,40.69,13.1,14.25,12.3,10.39,14.5,10.47,10th and 11th grade
75635,Equipment and environment needed to prepare the dish,1,"[Equipment, and, environment, needed, to, prepare, the, dish]","[(Equipment, NN), (and, CC), (environment, NN), (needed, VBN), (to, TO), (prepare, VB), (the, DT), (dish, NN)]",3,2,0,0,0,8,...,2.25,1.299038,46.44,8.8,8.2,9.1,13.01,4.0,9.95,8th and 9th grade
104421,"Otto von Bismarck -LRB- 1815 -- 1898 -RRB- , German statesman of the 19th century",1,"[Otto, von, Bismarck, LRB, 1815, 1898, RRB, German, statesman, of, the, 19th, century]","[(Otto, NNP), (von, NNP), (Bismarck, NNP), (LRB, NNP), (1815, CD), (1898, CD), (RRB, JJ), (German, JJ), (statesman, NN), (of, IN), (the, DT), (19th, JJ), (century, NN)]",6,0,4,3,0,13,...,1.923077,0.828487,49.82,9.5,11.35,9.3,8.63,8.5,10.35,8th and 9th grade
321670,"Soul Free , 2003",0,"[Soul, Free, 2003]","[(Soul, NNP), (Free, JJ), (2003, CD)]",1,0,1,1,0,3,...,2.0,0.0,93.81,0.9,1.2,0.5,-2.38,1.0,0.15,0th and 1st grade
252743,"Saint-Blaise , Alpes-Maritimes is a commune .",0,"[Saint, Blaise, Alpes, Maritimes, is, a, commune]","[(Saint, NNP), (Blaise, NNP), (Alpes, NNP), (Maritimes, NNP), (is, VBZ), (a, DT), (commune, NN)]",5,1,4,0,0,7,...,2.0,0.755929,15.64,12.3,2.0,17.8,18.88,3.5,10.2,12th and 13th grade
141229,Bryce was sworn in on 5 September 2008 .,1,"[Bryce, was, sworn, in, on, 5, September, 2008]","[(Bryce, NNP), (was, VBD), (sworn, VBN), (in, IN), (on, IN), (5, CD), (September, NNP), (2008, CD)]",2,2,2,0,0,8,...,1.625,0.695971,80.28,4.1,3.2,1.4,2.86,4.5,0.4,4th and 5th grade
378195,Street Parade in Zürich,0,"[Street, Parade, in, Zürich]","[(Street, NNP), (Parade, NNP), (in, IN), (Zürich, NNP)]",3,0,3,0,0,4,...,2.0,0.707107,92.8,1.3,1.6,4.1,5.8,1.0,0.2,0th and 1st grade
124087,Image : Illu repdt male .,1,"[Image, Illu, repdt, male]","[(Image, NN), (Illu, NNP), (repdt, VBZ), (male, NN)]",3,1,1,0,0,4,...,2.0,0.0,33.58,9.6,1.6,4.1,2.9,2.0,11.73,1st and 2nd grade
414030,"The Charlotte Bobcats are a team in the National Basketball Association in Charlotte , North Carolina .",0,"[The, Charlotte, Bobcats, are, a, team, in, the, National, Basketball, Association, in, Charlotte, North, Carolina]","[(The, DT), (Charlotte, NNP), (Bobcats, NNP), (are, VBP), (a, DT), (team, NN), (in, IN), (the, DT), (National, NNP), (Basketball, NNP), (Association, NNP), (in, IN), (Charlotte, NNP), (North, NNP), (Carolina, NNP)]",9,1,8,0,0,15,...,2.266667,1.388844,22.41,13.9,16.67,13.4,15.01,12.5,10.7,12th and 13th grade
