# Mineração de Dados
### Março 2016

In [3]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr

%matplotlib inline

## Cálculo de Correlação
#### baseado em https://github.com/nim4n/genomic_data_mining/blob/master/correlation_calculation.py

In [10]:
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0

In [66]:
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')

# create new dict for saving correlation between column and helpfulness
pearson_helpfull_list = []
spearmanr_helpfull_list = []
pearson_up_list = []
spearmanr_up_list = []
pearson_down_list = []
spearmanr_down_list = []

# fix thumbs columns
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns

# adiciona coluna de helpfulness no corpus
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)

# calcula correlacoes
for i in reviews_columns:
    pearson_helpfull_list.append(reviews_features.helpfulness.corr(reviews_features[i]))
    spearmanr_helpfull_list.append(spearmanr(reviews_features.helpfulness, reviews_features[i])[0])
    pearson_up_list.append(reviews_features.thumbsup.corr(reviews_features[i]))
    spearmanr_up_list.append(spearmanr(reviews_features.thumbsup, reviews_features[i])[0])
    pearson_down_list.append(reviews_features.thumbsdown.corr(reviews_features[i]))
    spearmanr_down_list.append(spearmanr(reviews_features.thumbsdown, reviews_features[i])[0])
    
corr_df = pd.DataFrame(data={'pearson_helpfull': np.array(pearson_helpfull_list)}, index=reviews_columns)
corr_df['spearmanr_helpfull'] = spearmanr_helpfull_list
corr_df['pearson_up'] = pearson_up_list
corr_df['spearmanr_up'] = spearmanr_up_list
corr_df['pearson_down'] = pearson_down_list
corr_df['spearmanr_down'] = spearmanr_down_list

#saving pearson correlation in a numpy file for future use
np.save('correlation.npy', corr_df)

In [67]:
corr_df.sort_values('pearson_helpfull',ascending=False).head(20)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.141733,0.10546,0.206188,0.285767,0.04684,-0.002055
syllable_count,0.134348,0.098542,0.218057,0.33094,0.053729,0.020001
thumbsup,0.111583,0.213016,1.0,1.0,0.532544,0.172691
functionalIncidence,0.067938,0.027364,0.072279,0.091068,0.016081,0.006552
percentile_90_sentence_length,0.050461,0.042376,0.097291,0.195694,0.03193,0.029446
ConnectiveTemporalIncidence,0.030754,0.039276,0.013497,0.111303,-0.004567,0.001193
percentile_90_word_length,0.0292,0.062258,0.008188,0.074663,-0.006464,-0.03743
percentile_75_word_length,0.027247,0.037398,0.010063,0.028726,-0.010924,-0.02969
percentile_75_sentence_length,0.026271,0.024356,0.06527,0.147194,0.025491,0.029419
ConnectiveCasualIncidence,0.023214,0.034393,0.011033,0.10716,-0.003066,0.004731


## Seleção de Atributos
#### basedo em http://scikit-learn.org/stable/modules/feature_selection.html

In [13]:
def helpInt(x): 
    try:
        return int ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0

In [14]:
# reset variables
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpInt,axis=1)

# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(32226, 34)

### SelectKBest: Univariate feature selection

In [15]:
# add references
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

new_features = SelectKBest(chi2, k=10).fit_transform(features, labels)
new_features.shape 

(32226, 10)

In [16]:
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
kbest_columns = []

for value in new_features[0]:
    for column in reviews_columns:
        if reviews_features[column].iloc[0] == value:
            kbest_columns.append(column)
            
kbest_columns

['mean_sentence_length',
 'syllable_count',
 'avg_word_per_sentence',
 'percentile_50_sentence_length',
 'median_sentence_length',
 'median_word_length',
 'percentile_50_sentence_length',
 'median_sentence_length',
 'median_word_length',
 'percentile_50_sentence_length',
 'median_sentence_length',
 'median_word_length',
 'pronIncidence',
 'verbIncidence',
 'adjectiveIncidence',
 'verbIncidence',
 'adjectiveIncidence',
 'advIncidence']

### LinearSVC: L1-based feature selection¶

In [52]:
# add references
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(features, labels)
model = SelectFromModel(lsvc, prefit=True)
new_features = model.transform(features)
new_features.shape

(32226, 32)

### ExtraTreesClassifier: Tree-based feature selection¶

In [53]:
# add references
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier()
clf = clf.fit(features, labels)
model = SelectFromModel(clf, prefit=True)
new_features = model.transform(features)
new_features.shape 

(32226, 2)