In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

In [2]:
train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test.csv')
resources=pd.read_csv('data/resources.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(train.dtypes)
print(test.dtypes)
print(resources.dtypes)

id                                              object
teacher_id                                      object
teacher_prefix                                  object
school_state                                    object
project_submitted_datetime                      object
project_grade_category                          object
project_subject_categories                      object
project_subject_subcategories                   object
project_title                                   object
project_essay_1                                 object
project_essay_2                                 object
project_essay_3                                 object
project_essay_4                                 object
project_resource_summary                        object
teacher_number_of_previously_posted_projects     int64
project_is_approved                              int64
dtype: object
id                                              object
teacher_id                                      obj

In [4]:
tid=['p036502']

In [5]:
train[train['id'].isin(tid)]

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1


In [6]:
resources['id'].isin(tid)

0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
1541242    False
1541243    False
1541244    False
1541245    False
1541246    False
1541247    False
1541248    False
1541249    False
1541250    False
1541251    False
1541252    False
1541253    False
1541254    False
1541255    False
1541256    False
1541257    False
1541258    False
1541259    False
1541260    False
1541261    False
1541262    False
1541263    False
1541264    False
1541265    False
1541266    False
1541267    False
1541268    False
1541269    Fal

In [7]:
df=pd.concat([train,test],sort=True)

In [8]:
df.loc[df.project_essay_3.isna(),['project_essay_2','project_essay_3']]=df.loc[df.project_essay_4.isna(),['project_essay_3','project_essay_2']].values
df[['project_essay_2','project_essay_4']]=df[['project_essay_2','project_essay_4']].fillna("")
df['project_essay_1']=df.apply(lambda x:x['project_essay_1']+x['project_essay_2'],axis=1)
df['project_essay_2']=df.apply(lambda x:x['project_essay_3']+x['project_essay_4'],axis=1)
df=df.drop(['project_essay_3','project_essay_4'],axis=1)


In [9]:
resources['total_price']=resources['quantity'] * resources['price']

In [10]:
resources.dtypes

id              object
description     object
quantity         int64
price          float64
total_price    float64
dtype: object

In [11]:
R=resources.groupby('id').agg({'description':'count','quantity':'sum','price':'sum','total_price':'sum'})\
    .rename(columns={'description':'items'})
R['avg_price']=R['total_price']/R['quantity']

for func in ['min','max','mean','std']:
    R=R.join(resources.groupby('id').agg({'quantity':func,'price':func,'total_price':func}).\
           rename(columns={'quantity':'quantity_'+func,'price':'price_'+func,'total_price':'total_price_'+func}))

R=R.join(resources.groupby('id').agg({'description':lambda x:' '.join(x.astype(str))}).rename(
    columns={'description':'resource_description'}))

df=df.join(R,on='id')

df['price_category']=pd.cut(df['total_price'], [0, 50, 100, 250, 500, 1000,np.inf])

for c in ['quantity', 'price', 'total_price']:
    df['max%s_min%s'%(c,c)] = df['%s_max'%c] - df['%s_min'%c]

In [35]:
le = LabelEncoder()
df['teacher_id'] = le.fit_transform(df['teacher_id'])
df['teacher_gender_unknown'] = df.teacher_prefix.apply(lambda x:int(x not in ['Ms.', 'Mrs.', 'Mr.']))

statFeatures = []
for col in ['school_state', 'teacher_id', 'teacher_prefix', 'teacher_gender_unknown', 'project_grade_category', 'project_subject_categories', 'project_subject_subcategories', 'teacher_number_of_previously_posted_projects']:
    Stat = df[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_stat'})
    Stat /= Stat.sum()
    df = df.join(Stat, on=col)

In [122]:
%%time
numFeatures=[df.columns[i] for i,j in enumerate(df.dtypes) if j == 'float64' and not (df.columns[i]=='project_is_approved') ]
T2 = df[numFeatures+['project_is_approved']].copy()
Ttr = T2[-pd.isna(df.project_is_approved)]
Tar_tr = Ttr['project_is_approved'].values
n = 10
inx = [np.random.randint(0, Ttr.shape[0], int(Ttr.shape[0]/n)) for k in range(n)]
# inx is used for crossvalidation of calculating the correlation and p-value
Corr = {}
for c in numFeatures:
    # since some values might be 0s, I use x+1 to avoid missing some important relations
    C1,P1=np.nanmean([pearsonr(Tar_tr[inx[k]],   (1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    C2,P2=np.nanmean([pearsonr(Tar_tr[inx[k]], 1/(1+Ttr[c].iloc[inx[k]])) for k in range(n)], 0)
    if P2<P1:
        T2[c] = 1/(1+T2[c])
        Corr[c] = [C2,P2]
    else:
        T2[c] = 1+T2[c]
        Corr[c] = [C1,P1]
        
        
polyCol = []
thrP = 0.01
thrC = 0.02
print('columns \t\t\t Corr1 \t\t Corr2 \t\t Corr Combined')
for i, c1 in enumerate(numFeatures[:-1]):
    C1, P1 = Corr[c1]
    for c2 in numFeatures[i+1:]:
        C2, P2 = Corr[c2]
        V = T2[c1] * T2[c2]
        Vtr = V[-pd.isna(T2.project_is_approved)].values
        C, P = pl.nanmean([pearsonr(Tar_tr[inx[k]], Vtr[inx[k]]) for k in range(n)], 0)
        if P<thrP and abs(C) - max(abs(C1),abs(C2)) > thrC:
            df[c1+'_'+c2+'_poly'] = V
            
            

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


columns 			 Corr1 		 Corr2 		 Corr Combined




CPU times: user 3 s, sys: 264 ms, total: 3.26 s
Wall time: 2.69 s


In [123]:
df

Unnamed: 0,id,project_essay_1,project_essay_2,project_grade_category,project_is_approved,project_resource_summary,project_subject_categories,project_subject_subcategories,project_submitted_datetime,project_title,...,teacher_number_of_previously_posted_projects_stat,price_avg_price_poly,price_price_mean_poly,total_price_avg_price_poly,total_price_total_price_max_poly,total_price_price_mean_poly,total_price_total_price_mean_poly,avg_price_price_max_poly,price_max_price_mean_poly,total_price_total_price_min_poly
0,p036502,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,Grades PreK-2,1.0,My students need 6 Ipod Nano's to create and d...,Literacy & Language,Literacy,2016-11-18 14:45:59,Super Sight Word Centers,...,0.003960,0.501661,0.501661,0.167592,0.500555,0.167592,0.500555,1.000000,1.000000,0.500555
1,p039565,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,Grades 3-5,0.0,My students need matching shirts to wear for d...,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",2017-04-26 15:57:28,Keep Calm and Dance On,...,0.146735,1.000000,1.000000,0.052369,1.000000,0.052369,1.000000,1.000000,1.000000,1.000000
2,p233823,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,Grades 3-5,1.0,My students need the 3doodler. We are an SEM s...,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",2017-01-01 22:57:44,Lets 3Doodle to Learn,...,0.037876,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,p185307,My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",Grades 3-5,0.0,My students need balls and other activity equi...,Health & Sports,Health & Wellness,2016-08-12 15:42:11,"\""Kid Inspired\"" Equipment to Increase Activit...",...,0.008004,0.201167,0.201167,0.201167,0.519337,0.201167,0.201167,0.387354,0.387354,0.029104
4,p013780,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,Grades 6-8,1.0,My students need a water filtration system for...,Health & Sports,Health & Wellness,2016-08-06 09:09:11,We need clean water for our culinary arts class!,...,0.001749,1.000000,1.000000,0.500702,1.000000,0.500702,1.000000,1.000000,1.000000,1.000000
5,p063374,My kids tell me each day that they want to mak...,I started a program called Telementoring in ho...,Grades PreK-2,1.0,My students need tablets in order to communic...,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",2016-11-05 10:01:51,Need to Reach Our Virtual Mentors!!!,...,0.274713,0.502388,0.502394,0.144034,0.571970,0.144036,0.500686,0.999891,0.999905,0.429403
6,p103285,Kindergarten is the new first grade. My studen...,With balance discs and stools as flexible seat...,Grades PreK-2,1.0,My students need stability stools and inflatab...,Health & Sports,Health & Wellness,2016-08-31 00:30:43,Active Kindergartners,...,0.146735,0.625030,0.504464,0.168675,0.927763,0.136138,0.501205,0.721609,0.582414,0.074647
7,p181781,First graders are fantastic! They are excited ...,First graders love learning! We need 6 wiggle-...,Grades PreK-2,1.0,My students need wiggle stools to allow them t...,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",2016-08-03 13:26:01,Fabulous Firsties-Wiggling to Learn!,...,0.274713,1.000000,1.000000,0.168671,1.000000,0.168671,1.000000,1.000000,1.000000,1.000000
8,p114989,My seventh graders dream big. They can't wait ...,I have used alternative seating in my classroo...,Grades 6-8,1.0,My students need seating that allows the most ...,Math & Science,Mathematics,2016-09-13 22:35:57,Wobble Chairs Help Fidgety Kids Focus,...,0.010426,1.000000,1.000000,0.252338,1.000000,0.252338,1.000000,1.000000,1.000000,1.000000
9,p191410,I teach first grade in a small farming town in...,There is nothing better than snuggling up with...,Grades PreK-2,1.0,My students need 2 youth sized reclining chair...,Literacy & Language,Literacy,2016-09-24 18:38:59,Snuggle Up With A Good Book,...,0.012379,1.000000,1.000000,0.504140,1.000000,0.504140,1.000000,1.000000,1.000000,1.000000
