# Initial Setup

In [1]:
#imports
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import joblib

## Loading the Data

In [2]:
# Load data
# Path is relative to the notebook's working directory. low_memory=False asks
# pandas to parse the file in a single pass rather than in internal chunks.
df = pd.read_csv('TextData/phrases.csv', low_memory=False)
df.head(5)

Unnamed: 0,id,context,correctness,growth,text
0,1,homework,correct,8,good job answering all the questions
1,2,paper,correct,9,you are a natural writer
2,3,homework,partial,5,received
3,4,paper,incorrect,2,bad writing
4,5,homework,correct,8,keep up the good work!


# Build Classifier

In [3]:
# TF-IDF vectorizer over unigrams and bigrams with English stop words removed;
# terms that occur in fewer than 5 phrases are dropped (min_df=5).
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense TF-IDF matrix for every phrase in the corpus.
features = tfidf.fit_transform(df['text']).toarray()

def get_trained(col):
    """Return the training part of a text/label split for one label column.

    ``df['text']`` is split against ``df[col]`` with ``train_test_split``
    (default proportions, ``random_state=0`` so the split is repeatable).
    The held-out test part is discarded.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` that holds the labels.

    Returns
    -------
    tuple
        ``(x_train, y_train)``: the training texts and their labels.
    """
    texts_train, _texts_test, labels_train, _labels_test = train_test_split(
        df['text'], df[col], random_state=0
    )
    return (texts_train, labels_train)

In [4]:
#generating growth mindset check
# Phrases whose 'growth' score is at or above this value are flagged True.
threshold = 5
# Plain column assignment instead of df.insert: insert raises ValueError when
# the column already exists, which made this cell fail on a second run.
# The vectorized comparison yields the same boolean column as the old lambda.
df['growth_mindset'] = df['growth'] >= threshold
df.head()

Unnamed: 0,id,context,correctness,growth,text,growth_mindset
0,1,homework,correct,8,good job answering all the questions,True
1,2,paper,correct,9,you are a natural writer,True
2,3,homework,partial,5,received,True
3,4,paper,incorrect,2,bad writing,False
4,5,homework,correct,8,keep up the good work!,True


In [5]:
def _fit_text_classifier(col):
    """Fit a counts -> TF-IDF -> MultinomialNB chain for one label column.

    Parameters
    ----------
    col : str
        Label column in ``df`` passed to ``get_trained``.

    Returns
    -------
    tuple
        ``(train, count_vect, x_train_counts, tfidf_transformer,
        x_train_tfidf, clf)`` where ``train`` is the ``(x_train, y_train)``
        pair from ``get_trained`` and the remaining items are the fitted
        vectorizer, the count matrix, the fitted TF-IDF transformer, the
        TF-IDF matrix and the fitted classifier.
    """
    train = get_trained(col)
    count_vect = CountVectorizer()
    x_train_counts = count_vect.fit_transform(train[0])
    tfidf_transformer = TfidfTransformer()
    x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
    clf = MultinomialNB().fit(x_train_tfidf, train[1])
    return (train, count_vect, x_train_counts,
            tfidf_transformer, x_train_tfidf, clf)

# Every module-level name from the original copy-pasted stanzas is still bound,
# so later cells that reference them keep working.

#training for context
(ctx_train, ctx_count_vect, ctx_x_train_counts,
 ctx_tfidf_transformer, ctx_x_train_tfidf, ctx_clf) = _fit_text_classifier('context')

#training for correctness
(cor_train, cor_count_vect, cor_x_train_counts,
 cor_tfidf_transformer, cor_x_train_tfidf, cor_clf) = _fit_text_classifier('correctness')

#training for growth mindset
(grow_train, grow_count_vect, grow_x_train_counts,
 grow_tfidf_transformer, grow_x_train_tfidf, grow_clf) = _fit_text_classifier('growth_mindset')

def classify(text):
    """Predict context, correctness and growth-mindset labels for a phrase.

    Each classifier was fitted on TF-IDF-weighted counts, so the phrase is
    passed through the matching fitted ``CountVectorizer`` and
    ``TfidfTransformer`` before ``predict``. The previous version skipped
    the TF-IDF step and fed raw counts to models trained on TF-IDF features.

    Parameters
    ----------
    text : str
        The feedback phrase to classify.

    Returns
    -------
    dict
        Keys ``'text'`` (the input), ``'context'``, ``'correctness'`` and
        ``'growth'`` (the three predicted labels).
    """
    sample = [text]

    def _predict(count_vect, tfidf_transformer, clf):
        # Same feature pipeline as training: counts, then TF-IDF weighting.
        counts = count_vect.transform(sample)
        return clf.predict(tfidf_transformer.transform(counts))[0]

    return {
        'text': text,
        'context': _predict(ctx_count_vect, ctx_tfidf_transformer, ctx_clf),
        'correctness': _predict(cor_count_vect, cor_tfidf_transformer, cor_clf),
        'growth': _predict(grow_count_vect, grow_tfidf_transformer, grow_clf),
    }

In [6]:
# Sanity check: run all three classifiers on a sample phrase.
print(classify('terrible paper'))

{'text': 'terrible paper', 'context': 'paper', 'correctness': 'incorrect', 'growth': False}


In [7]:
# Sanity check: same call with a differently worded phrase.
print(classify('great paper'))

{'text': 'great paper', 'context': 'paper', 'correctness': 'correct', 'growth': True}


In [8]:
#export
# NOTE(review): this writes the (x_train, y_train) training splits returned by
# get_trained, not the fitted vectorizers/classifiers. Confirm that is what
# consumers of 'model.pkl' expect.
joblib.dump([ctx_train, cor_train, grow_train], 'model.pkl', compress=9)

['model.pkl']

# Growth Mindset Score

In [9]:
def get_rec_from_score(phrase):
    """Return stored phrases in the same category as ``phrase``, best growth first.

    The phrase is classified once. Rows of ``df`` whose ``context`` and
    ``correctness`` both equal the predicted labels are kept and sorted by
    ``growth`` in descending order.

    Parameters
    ----------
    phrase : str
        The feedback phrase to find recommendations for.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``df`` ordered by ``growth`` (highest first).
    """
    # Classify once (the original did it twice) and build one mask on df,
    # instead of applying a df-indexed mask to an already filtered frame.
    labels = classify(phrase)
    same_category = (
        (df['context'] == labels['context'])
        & (df['correctness'] == labels['correctness'])
    )
    return df.loc[same_category].sort_values('growth', ascending=False)

In [10]:
# Show the five same-category phrases with the highest growth score.
get_rec_from_score('terrible paper').head(5)

Unnamed: 0,id,context,correctness,growth,text,growth_mindset
51,52,paper,incorrect,10,"Although paper is not upto par, you put a good...",True
52,53,paper,incorrect,9,"The paper doesn't meet expectations, but I am ...",True
48,49,paper,incorrect,8,Thank you for submitting. You need to focus on...,True
23,24,paper,incorrect,4,follow the rubric!,False
19,20,paper,incorrect,3,Poor grammar,False


# Phrase Similarity


In [11]:
sw = stopwords.words('english')
def get_cosine_dist(A, B):
    """Cosine similarity between the binary bag-of-words of two phrases.

    Both phrases are tokenized with ``word_tokenize``, and tokens found in
    the English stop-word list ``sw`` are dropped. No case normalization is
    applied. Each phrase becomes a set of distinct tokens, and the result is
    ``|A ∩ B| / sqrt(|A| * |B|)``.

    Despite the name, this is a similarity, not a distance: identical token
    sets give 1.0 and disjoint sets give 0.0.

    Parameters
    ----------
    A, B : str
        The two phrases to compare.

    Returns
    -------
    float
        Similarity in ``[0.0, 1.0]``. Returns 0.0 when either phrase has no
        tokens left after stop-word removal.
    """
    a_set = {w for w in word_tokenize(A) if w not in sw}
    b_set = {w for w in word_tokenize(B) if w not in sw}
    # An empty token set is a zero vector, for which the cosine is undefined.
    # The original raised ZeroDivisionError here; treat it as "no similarity".
    if not a_set or not b_set:
        return 0.0
    # For 0/1 vectors the dot product is the number of shared tokens and each
    # squared norm is the set size.
    shared = len(a_set & b_set)
    return shared / float((len(a_set) * len(b_set)) ** 0.5)


In [12]:
# Quick check on two phrases that share one word.
get_cosine_dist("hello world", "hi world")


0.5

In [13]:
# Two phrases that differ in a single word.
get_cosine_dist("I like Java", "I hate Java")


0.6666666666666666

In [14]:
# Two phrases with no words in common.
get_cosine_dist("Some other school", "Kean University")

0.0

In [15]:
def get_rec_from_similairty(phrase):
    """Return same-category stored phrases ranked by similarity to ``phrase``.

    The phrase is classified once. Rows of ``df`` whose ``context`` and
    ``correctness`` equal the predicted labels are kept, each row's ``text``
    is scored against ``phrase`` with ``get_cosine_dist``, and the rows are
    sorted by that score in descending order.

    Parameters
    ----------
    phrase : str
        The feedback phrase to find recommendations for.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``df`` with an added ``cosine_dist`` column as the
        last column, most similar first.
    """
    # Classify once (the original did it twice) and build one mask on df,
    # instead of applying a df-indexed mask to an already filtered frame.
    labels = classify(phrase)
    same_category = (
        (df['context'] == labels['context'])
        & (df['correctness'] == labels['correctness'])
    )
    # Explicit copy so the new column never touches the shared df.
    filtered = df.loc[same_category].copy()
    filtered['cosine_dist'] = filtered['text'].apply(
        lambda x: get_cosine_dist(x, B=phrase)
    )
    return filtered.sort_values('cosine_dist', ascending=False)


In [16]:
# Show the five same-category phrases most similar to the query.
get_rec_from_similairty('terrible paper').head(5)

Unnamed: 0,id,context,correctness,growth,text,growth_mindset,cosine_dist
11,12,paper,incorrect,1,terrible paper,False,1.0
39,40,paper,incorrect,2,bad paper,False,0.5
56,57,paper,incorrect,1,awful paper,False,0.5
43,44,paper,incorrect,1,"your paper is bad, and you should feel bad",False,0.353553
55,56,paper,incorrect,1,I don't like this paper,False,0.353553


# Final Recommendation

In [17]:
def get_recs(phrase):
    """Return same-category stored phrases ranked by growth, then similarity.

    The phrase is classified once. Rows of ``df`` whose ``context`` and
    ``correctness`` equal the predicted labels are kept, and each row's
    ``text`` is scored against ``phrase`` with ``get_cosine_dist``. The rows
    are sorted by ``growth`` descending, with ties broken by ``cosine_dist``
    descending.

    Parameters
    ----------
    phrase : str
        The feedback phrase to find recommendations for.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``df`` with an added ``cosine_dist`` column as the
        last column, best recommendation first.
    """
    # Classify once (the original did it twice) and build one mask on df,
    # instead of applying a df-indexed mask to an already filtered frame.
    labels = classify(phrase)
    same_category = (
        (df['context'] == labels['context'])
        & (df['correctness'] == labels['correctness'])
    )
    # Explicit copy so the new column never touches the shared df.
    filtered = df.loc[same_category].copy()
    filtered['cosine_dist'] = filtered['text'].apply(
        lambda x: get_cosine_dist(x, B=phrase)
    )
    return filtered.sort_values(['growth', 'cosine_dist'], ascending=(False, False))


In [18]:
# Pull the text of the single top-ranked recommendation.
get_recs('terrible paper').head(1)['text'].item()

'Although paper is not upto par, you put a good effort'