# Mineração de Dados
### Março 2016

In [45]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr

%matplotlib inline

## Cálculo de Correlação
#### baseado em https://github.com/nim4n/genomic_data_mining/blob/master/correlation_calculation.py

In [58]:
def helpf(x):
    """Return the fraction of positive votes for one review row.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must expose 'thumbsup' and 'thumbsdown' entries convertible to float.

    Returns
    -------
    float
        thumbsup / (thumbsup + thumbsdown), or 0.0 when the ratio cannot be
        computed (no votes at all, missing or unparseable vote counts).
    """
    try:
        up = float(x['thumbsup'])
        down = float(x['thumbsdown'])
        return up / (up + down)
    # Only data-level failures fall back to 0.0; KeyboardInterrupt and
    # SystemExit are no longer swallowed, so a long apply() can be interrupted.
    except (KeyError, TypeError, ValueError, ZeroDivisionError, OverflowError):
        return 0.0

In [61]:
# Load the review corpus from the gzip-compressed CSV.
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')

# Pearson correlation between each column and helpfulness, keyed by column name.
corr_dict = {}
pearson_list = []
spearmanr_list = []
# Select the numeric columns BEFORE 'helpfulness' is added, so the target
# is not correlated against itself.
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns

# Add the helpfulness column to the corpus.
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)

# Compute the correlations.
for i in reviews_columns:
    corr = reviews_features.helpfulness.corr(reviews_features[i])
    pearson_list.append(corr)
    # Use Series.corr for Spearman as well: it ignores NaN pairs exactly like
    # the Pearson call above, whereas scipy.stats.spearmanr propagates NaN by
    # default, which would make the two columns of corr_df incomparable.
    spearmanr_list.append(
        reviews_features.helpfulness.corr(reviews_features[i], method='spearman')
    )
    corr_dict[i] = corr

corr_df = pd.DataFrame(data={'pearson': np.array(pearson_list)}, index=reviews_columns)
corr_df['spearmanr'] = spearmanr_list

# Save the Pearson correlations for future use.
# NOTE: np.save stores a dict as a pickled 0-d object array; read it back with
# np.load('correlation.npy', allow_pickle=True).item().
np.save('correlation.npy', corr_dict)

In [62]:
# Show the (up to) 20 columns with the highest Pearson correlation to helpfulness.
corr_df.sort_values(by='pearson', ascending=False).head(20)

Unnamed: 0,pearson,spearmanr
sentence_count,0.141733,0.10546
syllable_count,0.134348,0.098542
functionalIncidence,0.067938,0.027364
percentile_90_sentence_length,0.050461,0.042376
ConnectiveTemporalIncidence,0.030754,0.039276
percentile_90_word_length,0.0292,0.062258
percentile_75_word_length,0.027247,0.037398
percentile_75_sentence_length,0.026271,0.024356
ConnectiveCasualIncidence,0.023214,0.034393
adjectiveIncidence,0.022691,0.064751


## Seleção de Atributos
#### baseado em http://scikit-learn.org/stable/modules/feature_selection.html

In [118]:
def helpabs(x):
    """Return the helpfulness of one review row as an integer class from 0 to 10.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must expose 'thumbsup' and 'thumbsdown' entries convertible to float.

    Returns
    -------
    int
        int(thumbsup * 10 / (thumbsup + thumbsdown)), truncated toward zero,
        or 0 when the ratio cannot be computed (no votes at all, missing,
        NaN or unparseable vote counts).
    """
    try:
        up = float(x['thumbsup'])
        down = float(x['thumbsdown'])
        return int(up * 10 / (up + down))
    # Only data-level failures fall back to 0; KeyboardInterrupt and
    # SystemExit are no longer swallowed, so a long apply() can be interrupted.
    except (KeyError, TypeError, ValueError, ZeroDivisionError, OverflowError):
        return 0

In [134]:
# Reload the corpus and rebuild the numeric column list and the target column
# (this time the integer helpfulness class produced by helpabs).
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['helpfulness'] = reviews_features.apply(helpabs, axis=1)

# Detach the class labels from the frame, then extract the feature matrix
# restricted to the numeric columns selected above.
labels = reviews_features.pop('helpfulness').values
features = reviews_features[list(reviews_columns)].values

# Clean the features: keep magnitudes only and replace NaN entries with 0.
features = np.absolute(features)
features[np.isnan(features)] = 0

features.shape

(32226, 34)

### SelectKBest: Univariate feature selection

In [135]:
# Univariate selection: keep the 10 features that score highest on the
# chi-squared statistic against the labels.
from sklearn.feature_selection import SelectKBest, chi2

new_features = SelectKBest(score_func=chi2, k=10).fit_transform(features, labels)
new_features.shape

(32226, 10)

### LinearSVC: L1-based feature selection

In [136]:
# These names are not imported anywhere else in the notebook; without the
# imports this cell raises NameError on a fresh kernel.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

# Fit an L1-penalised linear SVM, then let SelectFromModel pick features
# from the already-fitted estimator (prefit=True).
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(features, labels)
model = SelectFromModel(lsvc, prefit=True)
new_features = model.transform(features)
new_features.shape

(32226, 30)

### ExtraTreesClassifier: Tree-based feature selection

In [137]:
# These names are not imported anywhere else in the notebook; without the
# imports this cell raises NameError on a fresh kernel.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a randomized tree ensemble, then let SelectFromModel pick features
# from the already-fitted estimator (prefit=True).
# random_state is fixed so the set of selected features is reproducible
# across runs; the ensemble is randomized and would otherwise vary.
clf = ExtraTreesClassifier(random_state=0)
clf = clf.fit(features, labels)
model = SelectFromModel(clf, prefit=True)
new_features = model.transform(features)
new_features.shape

(32226, 23)