### 1.1 Loading Data

In [124]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
#import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [125]:
import pickle
import bz2

In [126]:
# Load the bz2-compressed, pickled dataset. The context manager guarantees the
# file handle is released even if unpickling raises.
# NOTE(review): pickle can execute arbitrary code on load -- only open files
# that come from a trusted source.
with bz2.BZ2File("data_pickle_compressed", "rb") as infile:
    data = pickle.load(infile)

In [127]:
# Separate the label column from the feature columns.
target_column = "project_is_approved"
y = data[target_column]
data = data.drop(columns=[target_column])

In [128]:
# Two-stage stratified split with a fixed seed: first hold out 20% of the rows
# as the test set, then hold out 20% of the remainder as the dev set.
# Labels are passed as an (n, 1) column array.
labels_2d = np.array(y).reshape(len(y), 1)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data,
    labels_2d,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
Xtrain, Xdev, ytrain, ydev = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.2,
    random_state=42,
    stratify=ytrain,
)

In [129]:
# TF-IDF features for the essay text. The vectorizer is fitted on the training
# split only, so dev/test essays are later projected onto the training
# vocabulary.
essays = TfidfVectorizer(min_df=10)
essays.fit(Xtrain["essay"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_essay_names = getattr(essays, "get_feature_names_out", None) or essays.get_feature_names
# NOTE(review): .toarray() materialises a dense matrix, which can be very
# large for a big vocabulary.
essay_tfidf = pd.DataFrame(
    essays.transform(Xtrain["essay"]).toarray(),
    columns=list(_essay_names()),
)


In [130]:
# Project the dev essays onto the vocabulary learned by `essays`.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_essay_names = getattr(essays, "get_feature_names_out", None) or essays.get_feature_names
dev_essay_tfidf = pd.DataFrame(
    essays.transform(Xdev["essay"]).toarray(),
    columns=list(_essay_names()),
)

In [131]:
# Project the test essays onto the vocabulary learned by `essays`.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_essay_names = getattr(essays, "get_feature_names_out", None) or essays.get_feature_names
test_essay_tfidf = pd.DataFrame(
    essays.transform(Xtest["essay"]).toarray(),
    columns=list(_essay_names()),
)

In [132]:
# state
# Binary bag-of-words encoding of school_state (binary=True caps every count
# at 1, lowercase=False keeps the original casing). Fitted on the training
# split only and then applied to train/dev/test.
state = CountVectorizer(encoding='string', decode_error='ignore', lowercase=False, binary=True)
state.fit(Xtrain["school_state"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_state_names = getattr(state, "get_feature_names_out", None) or state.get_feature_names
_state_columns = list(_state_names())
school_state_onehot = pd.DataFrame(
    state.transform(Xtrain["school_state"]).toarray(), columns=_state_columns
)
dev_school_state_onehot = pd.DataFrame(
    state.transform(Xdev["school_state"]).toarray(), columns=_state_columns
)
test_school_state_onehot = pd.DataFrame(
    state.transform(Xtest["school_state"]).toarray(), columns=_state_columns
)

In [133]:
# teacher_prefix
# Binary bag-of-words encoding of teacher_prefix, fitted on the training split
# only and then applied to train/dev/test.
teacher_prefix = CountVectorizer(encoding='string', decode_error='ignore', lowercase=False, binary=True)
teacher_prefix.fit(Xtrain["teacher_prefix"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_prefix_names = getattr(teacher_prefix, "get_feature_names_out", None) or teacher_prefix.get_feature_names
_prefix_columns = list(_prefix_names())
teacher_prefix_onehot = pd.DataFrame(
    teacher_prefix.transform(Xtrain["teacher_prefix"]).toarray(), columns=_prefix_columns
)
dev_teacher_prefix_onehot = pd.DataFrame(
    teacher_prefix.transform(Xdev["teacher_prefix"]).toarray(), columns=_prefix_columns
)
test_teacher_prefix_onehot = pd.DataFrame(
    teacher_prefix.transform(Xtest["teacher_prefix"]).toarray(), columns=_prefix_columns
)

In [134]:
# project_grade_category
# Binary bag-of-words encoding of project_grade_category, fitted on the
# training split only and then applied to train/dev/test.
project_grade_category = CountVectorizer(encoding='string', decode_error='ignore', lowercase=False, binary=True)
project_grade_category.fit(Xtrain["project_grade_category"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_grade_names = (
    getattr(project_grade_category, "get_feature_names_out", None)
    or project_grade_category.get_feature_names
)
_grade_columns = list(_grade_names())
project_grade_category_onehot = pd.DataFrame(
    project_grade_category.transform(Xtrain["project_grade_category"]).toarray(),
    columns=_grade_columns,
)
dev_project_grade_category_onehot = pd.DataFrame(
    project_grade_category.transform(Xdev["project_grade_category"]).toarray(),
    columns=_grade_columns,
)
test_project_grade_category_onehot = pd.DataFrame(
    project_grade_category.transform(Xtest["project_grade_category"]).toarray(),
    columns=_grade_columns,
)

In [135]:
# clean_categories
# Binary bag-of-words encoding of clean_categories, fitted on the training
# split only and then applied to train/dev/test.
clean_categories = CountVectorizer(encoding='string', decode_error='ignore', lowercase=False, binary=True)
clean_categories.fit(Xtrain["clean_categories"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_category_names = (
    getattr(clean_categories, "get_feature_names_out", None)
    or clean_categories.get_feature_names
)
_category_columns = list(_category_names())
clean_categories_onehot = pd.DataFrame(
    clean_categories.transform(Xtrain["clean_categories"]).toarray(),
    columns=_category_columns,
)
dev_clean_categories_onehot = pd.DataFrame(
    clean_categories.transform(Xdev["clean_categories"]).toarray(),
    columns=_category_columns,
)
test_clean_categories_onehot = pd.DataFrame(
    clean_categories.transform(Xtest["clean_categories"]).toarray(),
    columns=_category_columns,
)

In [136]:
# clean_subcategories
# Binary bag-of-words encoding of clean_subcategories, fitted on the training
# split only and then applied to train/dev/test.
clean_subcategories = CountVectorizer(encoding='string', decode_error='ignore', lowercase=False, binary=True)
clean_subcategories.fit(Xtrain["clean_subcategories"])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_subcategory_names = (
    getattr(clean_subcategories, "get_feature_names_out", None)
    or clean_subcategories.get_feature_names
)
_subcategory_columns = list(_subcategory_names())
clean_subcategories_onehot = pd.DataFrame(
    clean_subcategories.transform(Xtrain["clean_subcategories"]).toarray(),
    columns=_subcategory_columns,
)
dev_clean_subcategories_onehot = pd.DataFrame(
    clean_subcategories.transform(Xdev["clean_subcategories"]).toarray(),
    columns=_subcategory_columns,
)
test_clean_subcategories_onehot = pd.DataFrame(
    clean_subcategories.transform(Xtest["clean_subcategories"]).toarray(),
    columns=_subcategory_columns,
)


In [137]:
# teacher_number_of_previously_posted_projects
# Standardise the count of previously posted projects. The scaler statistics
# come from the training split only; each split is reshaped to a single
# column before scaling and wrapped in a one-column DataFrame afterwards.
num_projects = StandardScaler()
_projects_col = "teacher_number_of_previously_posted_projects"
num_projects.fit(Xtrain[_projects_col].values.reshape(-1, 1))

teacher_number_of_previously_posted_projects = pd.DataFrame(
    num_projects.transform(Xtrain[_projects_col].values.reshape(-1, 1)),
    columns=["previous_projects"],
)
dev_teacher_number_of_previously_posted_projects = pd.DataFrame(
    num_projects.transform(Xdev[_projects_col].values.reshape(-1, 1)),
    columns=["previous_projects"],
)
test_teacher_number_of_previously_posted_projects = pd.DataFrame(
    num_projects.transform(Xtest[_projects_col].values.reshape(-1, 1)),
    columns=["previous_projects"],
)

In [138]:
# price
# Standardise the price column with statistics learned from the training
# split, then wrap every scaled split in a one-column DataFrame.
prices = StandardScaler()
prices.fit(Xtrain["price"].values.reshape(-1, 1))

price = pd.DataFrame(
    columns=["price"],
    data=prices.transform(Xtrain["price"].values.reshape(-1, 1)),
)
dev_price = pd.DataFrame(
    columns=["price"],
    data=prices.transform(Xdev["price"].values.reshape(-1, 1)),
)
test_price = pd.DataFrame(
    columns=["price"],
    data=prices.transform(Xtest["price"].values.reshape(-1, 1)),
)

In [139]:
# Assemble the final TF-IDF design matrices by placing the categorical,
# numeric and essay feature frames side by side (column-wise) per split.
_train_parts = [
    school_state_onehot,
    teacher_prefix_onehot,
    project_grade_category_onehot,
    clean_categories_onehot,
    clean_subcategories_onehot,
    price,
    teacher_number_of_previously_posted_projects,
    essay_tfidf,
]
_dev_parts = [
    dev_school_state_onehot,
    dev_teacher_prefix_onehot,
    dev_project_grade_category_onehot,
    dev_clean_categories_onehot,
    dev_clean_subcategories_onehot,
    dev_price,
    dev_teacher_number_of_previously_posted_projects,
    dev_essay_tfidf,
]
_test_parts = [
    test_school_state_onehot,
    test_teacher_prefix_onehot,
    test_project_grade_category_onehot,
    test_clean_categories_onehot,
    test_clean_subcategories_onehot,
    test_price,
    test_teacher_number_of_previously_posted_projects,
    test_essay_tfidf,
]
train_tfidf = pd.concat(_train_parts, axis=1)
dev_tfidf = pd.concat(_dev_parts, axis=1)
test_tfidf = pd.concat(_test_parts, axis=1)

In [140]:
# NOTE(review): this rebuilds test_tfidf from the same inputs as the
# assignment at the end of the preceding cell; it looks redundant -- confirm
# before removing.
test_tfidf = pd.concat(
    [
        test_school_state_onehot,
        test_teacher_prefix_onehot,
        test_project_grade_category_onehot,
        test_clean_categories_onehot,
        test_clean_subcategories_onehot,
        test_price,
        test_teacher_number_of_previously_posted_projects,
        test_essay_tfidf,
    ],
    axis=1,
)

In [141]:
# Persist the first half of the training rows (the matrix is written in two
# files). The context manager closes the archive even if pickling fails.
with bz2.BZ2File("train_tfidf_1", "wb") as outfile:
    pickle.dump(train_tfidf[:len(train_tfidf) // 2], outfile)


In [142]:
# Persist the second half of the training rows. The context manager closes
# the archive even if pickling fails.
with bz2.BZ2File("train_tfidf_2", "wb") as outfile:
    pickle.dump(train_tfidf[len(train_tfidf) // 2:], outfile)


In [143]:
# Persist the dev TF-IDF feature matrix; the context manager closes the
# archive even if pickling fails.
with bz2.BZ2File("dev_tfidf", "wb") as outfile:
    pickle.dump(dev_tfidf, outfile)


In [144]:
# Persist the test TF-IDF feature matrix; the context manager closes the
# archive even if pickling fails.
with bz2.BZ2File("test_tfidf", "wb") as outfile:
    pickle.dump(test_tfidf, outfile)


In [145]:
# Persist the training labels; the context manager closes the archive even
# if pickling fails.
with bz2.BZ2File("ytrain", "wb") as outfile:
    pickle.dump(ytrain, outfile)


In [146]:
# Persist the dev labels; the context manager closes the archive even if
# pickling fails.
with bz2.BZ2File("ydev", "wb") as outfile:
    pickle.dump(ydev, outfile)


In [147]:
# Persist the test labels; the context manager closes the archive even if
# pickling fails.
with bz2.BZ2File("ytest", "wb") as outfile:
    pickle.dump(ytest, outfile)


In [148]:
# Persist the raw (un-vectorised) test split; the context manager closes the
# archive even if pickling fails.
with bz2.BZ2File("Xtest", "wb") as outfile:
    pickle.dump(Xtest, outfile)


In [149]:
# Load the pickled GloVe lookup and keep its keys as a set for fast
# membership checks.
# presumably a mapping of word -> embedding vector; verify against the file
# NOTE(review): pickle can execute arbitrary code on load -- only open files
# that come from a trusted source.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)

glove_words = set(model.keys())

In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [151]:
# Learn the IDF weights used for the TF-IDF weighted word vectors.
# Fit on the training essays only: `data` also contains the rows that were
# split off into Xdev/Xtest, so fitting on it would leak dev/test statistics
# into the features.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(Xtrain["essay"].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [152]:
# Build a word -> IDF lookup and the set of words known to the TF-IDF model.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when it exists and fall back on older releases.
_tfidf_names = getattr(tfidf_model, "get_feature_names_out", None) or tfidf_model.get_feature_names
_tfidf_vocabulary = list(_tfidf_names())
# idf_ is aligned with the feature-name order, so zipping pairs each word
# with its own weight.
dict_idf = dict(zip(_tfidf_vocabulary, tfidf_model.idf_))
tfidf_words = set(_tfidf_vocabulary)

In [153]:

# TF-IDF weighted average of GloVe vectors, one 300-d vector per training essay.
# assumes every vector in `model` has 300 components -- TODO confirm
tfidfw2v_train = []
for para in Xtrain["essay"].values:
    vector = np.zeros(300)  # running weighted sum of word vectors
    tf_idf_weight = 0  # running sum of the weights, used to normalise
    words = para.split()  # tokenise once instead of once per word
    # Count whole tokens. str.count() on the raw essay matched substrings
    # (e.g. "art" inside "party"), which inflated the term frequency.
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
    # As before, every occurrence of a word contributes to the sum.
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            tf_idf = dict_idf[word] * (word_counts[word] / len(words))
            vector = vector + (vec * tf_idf)
            tf_idf_weight = tf_idf_weight + tf_idf
    # Essays with no usable word keep the all-zero vector.
    if tf_idf_weight != 0:
        vector = vector / tf_idf_weight
    tfidfw2v_train.append(vector)
    
            
        

In [154]:
# TF-IDF weighted average of GloVe vectors, one 300-d vector per test essay.
# assumes every vector in `model` has 300 components -- TODO confirm
tfidfw2v_test = []
for para in Xtest["essay"].values:
    vector = np.zeros(300)  # running weighted sum of word vectors
    tf_idf_weight = 0  # running sum of the weights, used to normalise
    words = para.split()  # tokenise once instead of once per word
    # Count whole tokens. str.count() on the raw essay matched substrings
    # (e.g. "art" inside "party"), which inflated the term frequency.
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
    # As before, every occurrence of a word contributes to the sum.
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            tf_idf = dict_idf[word] * (word_counts[word] / len(words))
            vector = vector + (vec * tf_idf)
            tf_idf_weight = tf_idf_weight + tf_idf
    # Essays with no usable word keep the all-zero vector.
    if tf_idf_weight != 0:
        vector = vector / tf_idf_weight
    tfidfw2v_test.append(vector)
    
            
        

In [155]:
# TF-IDF weighted average of GloVe vectors, one 300-d vector per dev essay.
# assumes every vector in `model` has 300 components -- TODO confirm
tfidfw2v_dev = []
for para in Xdev["essay"].values:
    vector = np.zeros(300)  # running weighted sum of word vectors
    tf_idf_weight = 0  # running sum of the weights, used to normalise
    words = para.split()  # tokenise once instead of once per word
    # Count whole tokens. str.count() on the raw essay matched substrings
    # (e.g. "art" inside "party"), which inflated the term frequency.
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
    # As before, every occurrence of a word contributes to the sum.
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            tf_idf = dict_idf[word] * (word_counts[word] / len(words))
            vector = vector + (vec * tf_idf)
            tf_idf_weight = tf_idf_weight + tf_idf
    # Essays with no usable word keep the all-zero vector.
    if tf_idf_weight != 0:
        vector = vector / tf_idf_weight
    tfidfw2v_dev.append(vector)
    
            
        

In [156]:
# Turn each list of per-essay vectors into a DataFrame (one row per essay,
# default integer column labels).
_train_matrix = np.array(tfidfw2v_train)
_test_matrix = np.array(tfidfw2v_test)
_dev_matrix = np.array(tfidfw2v_dev)

tfidfw2v_train = pd.DataFrame(_train_matrix)
tfidfw2v_test = pd.DataFrame(_test_matrix)
tfidfw2v_dev = pd.DataFrame(_dev_matrix)

In [157]:
# Assemble the TF-IDF-weighted-GloVe design matrices: the same categorical
# and numeric frames as the TF-IDF variant, with the averaged word vectors
# taking the place of the sparse essay features.
_train_w2v_parts = [
    school_state_onehot,
    teacher_prefix_onehot,
    project_grade_category_onehot,
    clean_categories_onehot,
    clean_subcategories_onehot,
    price,
    teacher_number_of_previously_posted_projects,
    tfidfw2v_train,
]
_dev_w2v_parts = [
    dev_school_state_onehot,
    dev_teacher_prefix_onehot,
    dev_project_grade_category_onehot,
    dev_clean_categories_onehot,
    dev_clean_subcategories_onehot,
    dev_price,
    dev_teacher_number_of_previously_posted_projects,
    tfidfw2v_dev,
]
_test_w2v_parts = [
    test_school_state_onehot,
    test_teacher_prefix_onehot,
    test_project_grade_category_onehot,
    test_clean_categories_onehot,
    test_clean_subcategories_onehot,
    test_price,
    test_teacher_number_of_previously_posted_projects,
    tfidfw2v_test,
]
train_tfidfw2v = pd.concat(_train_w2v_parts, axis=1)
dev_tfidfw2v = pd.concat(_dev_w2v_parts, axis=1)
test_tfidfw2v = pd.concat(_test_w2v_parts, axis=1)

In [167]:
# Persist the training TF-IDF-weighted word-vector features; the context
# manager closes the archive even if pickling fails.
with bz2.BZ2File("train_tfidfw2v", "wb") as outfile:
    pickle.dump(train_tfidfw2v, outfile)

In [168]:
# Persist the dev TF-IDF-weighted word-vector features; the context manager
# closes the archive even if pickling fails.
with bz2.BZ2File("dev_tfidfw2v", "wb") as outfile:
    pickle.dump(dev_tfidfw2v, outfile)

In [169]:
# Persist the test TF-IDF-weighted word-vector features; the context manager
# closes the archive even if pickling fails.
with bz2.BZ2File("test_tfidfw2v", "wb") as outfile:
    pickle.dump(test_tfidfw2v, outfile)

In [170]:
# Sanity check: (rows, columns) of the assembled dev word-vector matrix.
dev_tfidfw2v.shape

(17480, 401)

In [162]:
# Sanity check: (rows, columns) of the training essay vectors on their own.
tfidfw2v_train.shape

(69918, 300)

In [163]:
# Sanity check: (rows, columns) of the assembled training TF-IDF matrix.
train_tfidf.shape

(69918, 14041)