In [1]:
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sc
from scipy import stats

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [3]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [4]:
final = pd.read_csv("sample_submission.csv")
final

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


### EDA: Getting to know our data

In [5]:
#add a new column to reflect the length of article
length = []
for i in train['full_text']:
    leng = len(i.split())
    length.append(leng)

In [6]:
train['full_text'] = train["full_text"].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)
test['full_text'] = test["full_text"].replace(re.compile(r'[\n\r\t]'), ' ', regex=True)

In [7]:
train['text_length']=length
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,text_length
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,261
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,533
2,00299B378633,"Dear, Principal If u change the school policy...",3.0,3.5,3.0,3.0,3.0,2.5,320
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,728
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,234
...,...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5,179
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0,465
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0,257
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5,510


In [8]:
des_df = train[['text_id','cohesion','syntax','vocabulary','vocabulary','grammar','conventions']]
des_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,vocabulary.1,grammar,conventions
0,0016926B079C,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,2.5,2.5,3.0,3.0,2.0,2.5
2,00299B378633,3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...
3906,FFD29828A873,2.5,3.0,3.0,3.0,2.5,2.5
3907,FFD9A83B0849,4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,4.0,4.5,4.5,4.5,4.5,4.5


### Statistical Description for Train Dataset

In [9]:
des_train = des_df.select_dtypes(['int','float']).describe().T
#des_train = des_train.set_index('index')
reset_train = des_df.select_dtypes(['int','float']).describe().T.reset_index(drop=False) 
reset_train.style.background_gradient(cmap='Greens')

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
0,cohesion,3911.0,3.127077,0.662542,1.0,2.5,3.0,3.5,5.0
1,syntax,3911.0,3.028254,0.644399,1.0,2.5,3.0,3.5,5.0
2,vocabulary,3911.0,3.235745,0.583148,1.0,3.0,3.0,3.5,5.0
3,vocabulary,3911.0,3.235745,0.583148,1.0,3.0,3.0,3.5,5.0
4,grammar,3911.0,3.032856,0.699841,1.0,2.5,3.0,3.5,5.0
5,conventions,3911.0,3.081053,0.67145,1.0,2.5,3.0,3.5,5.0


In [10]:
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.graph_objs as go
import pandas as pd
import numpy as np

In [11]:
plot_rows=3
plot_cols=2
fig = make_subplots(rows=plot_rows, cols=plot_cols)

In [12]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yudichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# sample_train = train['full_text'][:50]

text_tag = []
for i in train['full_text']:
    i = word_tokenize(i)
    pos_tagger = nltk.pos_tag(i)
    word_tag = []
    for j in pos_tagger:
        (word,tag) = j
        word_tag.append(tag)
    text_tag.append(word_tag)

In [14]:
import nltk
from nltk.data import load
nltk.download('tagsets')
tagdict = load('help/tagsets/upenn_tagset.pickle')
tagdict.keys()

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/yudichen/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


dict_keys(['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS'])

In [15]:
tags = list(tagdict.keys())
tags.append("#")
tags

['LS',
 'TO',
 'VBN',
 "''",
 'WP',
 'UH',
 'VBG',
 'JJ',
 'VBZ',
 '--',
 'VBP',
 'NN',
 'DT',
 'PRP',
 ':',
 'WP$',
 'NNPS',
 'PRP$',
 'WDT',
 '(',
 ')',
 '.',
 ',',
 '``',
 '$',
 'RB',
 'RBR',
 'RBS',
 'VBD',
 'IN',
 'FW',
 'RP',
 'JJR',
 'JJS',
 'PDT',
 'MD',
 'VB',
 'WRB',
 'NNP',
 'EX',
 'NNS',
 'SYM',
 'CC',
 'CD',
 'POS',
 '#']

In [16]:
#list(tagdict.keys())
freq = pd.DataFrame(index=range(0,len(train)),columns=tags)
freq.iloc[:] = 0
freq = freq.astype('int')

In [17]:
from collections import Counter
d = []
for i in text_tag:
    d.append(Counter(i))

In [18]:
for i in range(len(train)):
    for key, value in d[i].items():
        freq[key][i] = value

In [19]:
verb = ["VB","VBG","VBD","VBN","VBP","VBZ"]
wh = ["WDT","WP","WRB","PDT","DT","WP$"] #determiner
connection = ["UH","RP","TO","IN","CC","MD","EX"]
adb = ["RB","RBR","RBS"]
pronoun = ["POS","PRP","PRP$"]
noun = ["NN","NNS","NNP","NNPS","LS"]
adj = ["JJ","JJR","JJS"]
fw = ["FW"] #foreign word
number = ['CD'] #number
punc = ["''","--","(",")",".",",","``","$","#","SYM"]

word_type = [verb,wh,connection,adb,pronoun, noun,adj,fw,number,punc]

In [20]:
#df['Fruit Total']= df.iloc[:, -4:-1].sum(axis=1)
train['verb'] = freq[verb].sum(axis=1)
train['wh'] = freq[wh].sum(axis=1)
train['connection'] = freq[connection].sum(axis=1)
train['adb'] = freq[adb].sum(axis=1)
train['pronoun'] = freq[pronoun].sum(axis=1)
train['noun'] = freq[noun].sum(axis=1)
train['adj'] = freq[adj].sum(axis=1)
train['fw'] = freq[fw].sum(axis=1)
train['number'] = freq[number].sum(axis=1)
train['punc'] = freq[punc].sum(axis=1)

In [21]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,text_length,verb,wh,connection,adb,pronoun,noun,adj,fw,number,punc
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,261,63,21,69,13,32,47,19,0,0,19
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,533,131,53,137,25,57,89,42,0,2,18
2,00299B378633,"Dear, Principal If u change the school policy...",3.0,3.5,3.0,3.0,3.0,2.5,320,71,32,91,15,32,68,21,0,0,26
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,728,204,68,162,46,125,94,56,0,2,79
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,234,51,34,42,5,30,59,12,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5,179,36,14,40,9,21,49,9,0,0,11
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0,465,93,49,132,27,17,103,41,0,5,32
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0,257,52,26,66,21,35,39,17,0,2,17
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5,510,102,42,121,30,68,108,46,0,2,39


In [22]:
import gensim

In [23]:
text = []
for i in train.full_text:
    j = word_tokenize(i)
    text.append(j)


In [24]:
len(text)

3911

In [25]:
word2vec = gensim.models.Word2Vec(text, vector_size=1000, window=10,min_count=1)

In [26]:
word2vec.wv.key_to_index

{'to': 0,
 '.': 1,
 ',': 2,
 'the': 3,
 'and': 4,
 'you': 5,
 'a': 6,
 'that': 7,
 'is': 8,
 'in': 9,
 'they': 10,
 'of': 11,
 'have': 12,
 'be': 13,
 'I': 14,
 'can': 15,
 'it': 16,
 'do': 17,
 'for': 18,
 'because': 19,
 'are': 20,
 'not': 21,
 'people': 22,
 'school': 23,
 'or': 24,
 'students': 25,
 'with': 26,
 'will': 27,
 'your': 28,
 'we': 29,
 'their': 30,
 "n't": 31,
 'more': 32,
 'if': 33,
 'what': 34,
 'time': 35,
 'good': 36,
 'get': 37,
 'my': 38,
 'on': 39,
 'work': 40,
 'but': 41,
 'like': 42,
 'want': 43,
 'them': 44,
 'life': 45,
 'would': 46,
 'help': 47,
 'make': 48,
 'i': 49,
 'at': 50,
 'when': 51,
 'about': 52,
 'this': 53,
 'think': 54,
 "'s": 55,
 'some': 56,
 'so': 57,
 'all': 58,
 'from': 59,
 'was': 60,
 'how': 61,
 'one': 62,
 'other': 63,
 'something': 64,
 'there': 65,
 'know': 66,
 'need': 67,
 'should': 68,
 'things': 69,
 'me': 70,
 'he': 71,
 'going': 72,
 'person': 73,
 'better': 74,
 'go': 75,
 'just': 76,
 'by': 77,
 'example': 78,
 'take': 79,
 'a

In [27]:
def doc_vec(doc):
    doc_vector = []
    for i in doc:
        word_vec = []
        for j in i:
            if j in word2vec.wv.index_to_key:
                vec = word2vec.wv.get_vector(j)
                word_vec.append(vec)
        doc_vec = np.mean(word_vec,axis=0)
        doc_vector.append(doc_vec)
    return doc_vector

In [28]:
vector_list=[]
text_df = pd.DataFrame({'text':text})
text_df['doc_vec'] = text_df.apply(doc_vec)

In [29]:
x_train = np.asarray(text_df['doc_vec'].to_list())

In [30]:
x_train.shape

(3911, 1000)

In [31]:
#x_train = x_train.reshape(x_train.shape[0],x_train.shape[1],1)

In [32]:
y_train = np.asarray(train[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']])

In [33]:
y_train.shape

(3911, 6)

In [37]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(n_estimators=500,max_depth=10, random_state=0)
regr.fit(x_train, y_train)

RandomForestRegressor(max_depth=10, n_estimators=500, random_state=0)

In [38]:
regr.predict(x_train)

array([[3.30203059, 3.23184278, 3.33131656, 3.30710024, 3.35401458,
        3.25549156],
       [2.49019276, 2.36190613, 2.77376823, 2.37300509, 2.24219912,
        2.42495688],
       [3.02054067, 3.0645779 , 3.12953803, 3.05014905, 3.01789748,
        2.95755926],
       ...,
       [2.93962723, 2.98180142, 3.16110128, 3.07827699, 3.17600514,
        3.00820637],
       [3.59000109, 3.63996559, 3.79425631, 3.61957787, 3.66021975,
        3.6749586 ],
       [3.20076999, 2.8210421 , 3.24318051, 3.00487587, 2.85530377,
        3.14644587]])

In [44]:
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
#from keras.layers.merge import concatenate
from keras.models import Model
#from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [49]:
embedding_layer = Embedding(3911, 1000,
        weights=[x_train],
        input_length=100,
        trainable=False)
lstm_layer = LSTM(128, dropout=0.1, recurrent_dropout=0.1)

sequence_1_input = Input(shape=(100,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(100,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = Dropout(0.1)(y1)

merged = Dense(12, activation='softmax')(merged)
merged = Dropout(0.1)(merged)

preds = Dense(6, activation='sigmoid')(merged)

model = Model(inputs=sequence_1_input, outputs=preds)
model.compile(loss='brmsprop',
        optimizer='adam',
        metrics=['acc'])

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 100), dtype=tf.int32, name='input_8'), name='input_8', description="created by layer 'input_8'") at layer "embedding_3". The following previous layers were accessed without issue: []

In [50]:
hist = model.fit(x_train, epochs=10)


Epoch 1/10


ValueError: in user code:

    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/yudichen/opt/anaconda3/envs/anly580/lib/python3.9/site-packages/keras/engine/input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1000) dtype=float32>]
