# Mineração de Dados
### Março 2016

In [1]:
import sys
sys.path.append("../pylinguistics/pylinguistics/")

import Pylinguistics as pl
import pandas as pd
import plotsfunc as pf
import numpy as np
from scipy.stats import spearmanr

%matplotlib inline

## Cálculo de Correlação
#### baseado em https://github.com/nim4n/genomic_data_mining/blob/master/correlation_calculation.py

In [2]:
def helpf(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def fillnanup(x): 
    try:
        return float(x['thumbsup'])
    except:
        return 0
    
def fillnandown(x): 
    try:
        return float(x['thumbsdown'])
    except:
        return 0
    
def thumbsSum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0

In [9]:
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')

# create new dict for saving correlation between column and helpfulness
pearson_helpfull_list = []
spearmanr_helpfull_list = []
pearson_up_list = []
spearmanr_up_list = []
pearson_down_list = []
spearmanr_down_list = []

# fix thumbs columns
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
reviews_features['thumbsup'] = reviews_features.apply(fillnanup,axis=1)
reviews_features['thumbsdown'] = reviews_features.apply(fillnandown,axis=1)

# adiciona coluna de helpfulness no corpus
reviews_features['helpfulness'] = reviews_features.apply(helpf,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbsSum,axis=1)
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features = reviews_features[np.isfinite(reviews_features.helpfulness)]

## para usar no weka
## reviews_features.to_csv('reviews-help.csv',encoding='utf-8')

reviews_features.shape

(9696, 49)

In [4]:
# calcula correlacoes
for i in reviews_columns:
    pearson_helpfull_list.append(reviews_features.helpfulness.corr(reviews_features[i]))
    spearmanr_helpfull_list.append(spearmanr(reviews_features.helpfulness, reviews_features[i])[0])
    pearson_up_list.append(reviews_features.thumbsup.corr(reviews_features[i]))
    spearmanr_up_list.append(spearmanr(reviews_features.thumbsup, reviews_features[i])[0])
    pearson_down_list.append(reviews_features.thumbsdown.corr(reviews_features[i]))
    spearmanr_down_list.append(spearmanr(reviews_features.thumbsdown, reviews_features[i])[0])
    
corr_df = pd.DataFrame(data={'pearson_helpfull': np.array(pearson_helpfull_list)}, index=reviews_columns)
corr_df['spearmanr_helpfull'] = spearmanr_helpfull_list
corr_df['pearson_up'] = pearson_up_list
corr_df['spearmanr_up'] = spearmanr_up_list
corr_df['pearson_down'] = pearson_down_list
corr_df['spearmanr_down'] = spearmanr_down_list

#saving pearson correlation in a numpy file for future use
#np.save('correlation.npy', corr_df)

In [5]:
corr_df.sort_values('pearson_helpfull',ascending=False).head(20)

Unnamed: 0,pearson_helpfull,spearmanr_helpfull,pearson_up,spearmanr_up,pearson_down,spearmanr_down
sentence_count,0.233848,0.272718,0.190857,0.287981,-0.035178,-0.134337
syllable_count,0.232391,0.296291,0.194409,0.323703,-0.039854,-0.141024
functionalIncidence,0.110437,0.06883,0.084086,0.084159,-0.018186,-0.029111
percentile_90_sentence_length,0.098786,0.13766,0.077552,0.158846,-0.021252,-0.062589
percentile_90_word_length,0.075885,0.088339,0.028634,0.074606,-0.026484,-0.055376
percentile_75_sentence_length,0.060591,0.090048,0.046828,0.109611,-0.013834,-0.040199
ConnectiveTemporalIncidence,0.051017,0.098355,0.01362,0.086583,-0.017233,-0.057675
percentile_75_word_length,0.045398,0.058189,0.026592,0.054365,-0.009669,-0.036048
ConnectiveCasualIncidence,0.042612,0.09068,0.010783,0.07989,-0.014929,-0.053222
LogicOperatorsIncidence,0.030953,0.116469,0.006677,0.103053,-0.009675,-0.068792


## Seleção de Atributos
#### basedo em http://scikit-learn.org/stable/modules/feature_selection.html

In [6]:
# split class and features
labels = reviews_features["helpfulness"].values
del reviews_features['helpfulness']
features = reviews_features[list(reviews_columns)].values

# clean features
features = np.abs(features)
features[np.isnan(features) == True] = 0

features.shape

(9696, 34)

### SelectKBest: Univariate feature selection

In [7]:
# add references
from sklearn.feature_selection import SelectKBest, f_regression

new_features = SelectKBest(f_regression, k=15).fit_transform(features, labels)
new_features.shape 

(9696, 15)

In [8]:
reviews_columns = reviews_features.select_dtypes(include=['float64','int']).columns
kbest_columns = []

for value in new_features[0]:
    for column in reviews_columns:
        if reviews_features[column].iloc[0] == value:
            kbest_columns.append(column)
            
kbest_columns

['percentile_75_sentence_length',
 'percentile_90_word_length',
 'stars',
 'syllable_count',
 'percentile_90_sentence_length',
 'pronIncidence',
 'ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveTemporalIncidence',
 'ConnectiveCasualIncidence',
 'ContentDiversty',
 'verbIncidence',
 'functionalIncidence',
 'contentIncidence',
 'ConnectiveAdditiveIncidence',
 'ConnectiveLogicIncidence',
 'LogicIfIncidence',
 'LogicAndIncidence',
 'LogicOperatorsIncidence',
 'percentile_25_word_length',
 'LogicNegationIncidence',
 'ConnectiveTemporalIncidence',
 'ConnectiveCasualIncidence',
 'sentence_count',
 'advIncidence',
 'LexicalDiversty',
 'adpPronRatio']