In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import ks_2samp
from scipy.special import stdtr


def helpFloat(x): 
    try:
        return float ( float(x['thumbsup']) / (float(x['thumbsup']) + float(x['thumbsdown'])) )
    except:
        return 0
    
def thumbsSum(x): 
    try:
        return int(x['thumbsup']) + int(x['thumbsdown']) 
    except:
        return 0

def sentiment(x): 
    if x['stars'] == 5:
        return 'favorable'
    if x['stars'] == 1:
        return 'unfavorable'
    else:
        return 'mixed'
    
reviews_features = pd.read_csv('reviews.csv.gz', compression='gzip')
reviews_features['helpfulness'] = reviews_features.apply(helpFloat,axis=1)
reviews_features['thumbstotal'] = reviews_features.apply(thumbsSum,axis=1)
reviews_features = reviews_features[reviews_features.thumbstotal>4]
reviews_features = reviews_features[reviews_features.word_count>9]
reviews_features = reviews_features[np.isfinite(reviews_features.helpfulness)]
list_search_categories = [ 'Celular e Smartphone' , 'TV', 'Camera Digital', 'Maquina de Lavar Roupas'
                          , 'Geladeira / Refrigerador', 'Ar Condicionado', 'Tablet', 'Notebook'
                          ,'Console de Videogame', 'Impressora', 'Fogao', 'Microondas'
                          , 'Aparelho de Telefone', 'MP3 Player / MP4 Player', 'Aquecedor de Ambiente'
                          , 'Forno Eletrico']
reviews_features = reviews_features[reviews_features.category.isin(list_search_categories)]
reviews_features.shape

(7444, 49)

## split dataset sentiment

In [4]:
mixed_reviews = reviews_features[reviews_features.stars > 1]
mixed_reviews = mixed_reviews[reviews_features.stars < 5]
favorable_reviews = reviews_features[reviews_features.stars == 5]
unfavorable_reviews = reviews_features[reviews_features.stars == 1]

## t-student test

In [8]:
def tstudent(df1, df2, m_list, p_list):
    col =list(df1)
    count=1
    for i in xrange(len(col)-1): 
        try:
            # Create sample data.
            a = df1[col[i]].as_matrix()
            b = df2[col[i]].as_matrix()
            # Use scipy.stats.ttest_ind.
            t, p = ttest_ind(a, b, equal_var=False)
            m_list.append(np.mean(a))
            p_list.append("%.4g" % p)
            if (p <= 0.05):
                #print p
                if (np.mean(a) > np.mean(b)):
                    dife="scientific > non_scientific"
                else:
                    dife="scientific < non_scientific"

                print "%i. %s t = %f  p = %.4g  %s" % (count, dife,  t, p, col[i])

                count=count+1
            else:
                print "IGUAL %s p=%f" %(col[i],p)
        except:
            m_list.append(0)
            p_list.append(0)
            print "ERROR %s " %(col[i])

## favorable x unfavorable

In [9]:
fav_m = []
fav_unfav_p = []
tstudent(favorable_reviews, unfavorable_reviews, fav_m, fav_unfav_p)

1. scientific > non_scientific t = 6.192726  p = 9.948e-10  Unnamed: 0
IGUAL ConnectiveAdditiveIncidence p=0.734745
IGUAL sentence_sized_30 p=0.223197
IGUAL redability p=0.814728
IGUAL mean_sentence_length p=0.171871
2. scientific < non_scientific t = -2.004392  p = 0.04542  percentile_75_sentence_length
3. scientific > non_scientific t = 2.470721  p = 0.01367  percentile_75_word_length
4. scientific < non_scientific t = -9.229879  p = 3.103e-19  word_count
IGUAL ConnectiveLogicIncidence p=0.482387
IGUAL percentile_90_word_length p=0.204028
5. scientific > non_scientific t = 2.675829  p = 0.007563  mean_word_length
6. scientific < non_scientific t = -9.218459  p = 3.62e-19  syllable_count
IGUAL avg_word_per_sentence p=0.213321
IGUAL LogicIfIncidence p=nan
IGUAL percentile_50_sentence_length p=0.571777
IGUAL median_sentence_length p=0.571777
IGUAL LogicAndIncidence p=0.482387
7. scientific < non_scientific t = -3.481081  p = 0.0005303  percentile_90_sentence_length
IGUAL median_word_len

## favorable x mixed

In [10]:
mix_m = []
fav_mix_p = []
tstudent(mixed_reviews, favorable_reviews, mix_m, fav_mix_p)

IGUAL Unnamed: 0 p=0.507477
IGUAL ConnectiveAdditiveIncidence p=0.400764
1. scientific > non_scientific t = 3.690984  p = 0.0002277  sentence_sized_30
IGUAL redability p=0.353477
2. scientific > non_scientific t = 2.892443  p = 0.003853  mean_sentence_length
3. scientific > non_scientific t = 3.930128  p = 8.705e-05  percentile_75_sentence_length
IGUAL percentile_75_word_length p=0.262711
4. scientific > non_scientific t = 14.218311  p = 1.282e-44  word_count
IGUAL ConnectiveLogicIncidence p=0.713366
IGUAL percentile_90_word_length p=0.591007
5. scientific < non_scientific t = -2.630640  p = 0.008581  mean_word_length
6. scientific > non_scientific t = 14.626015  p = 4.428e-47  syllable_count
7. scientific > non_scientific t = 2.003444  p = 0.04524  avg_word_per_sentence
IGUAL LogicIfIncidence p=nan
8. scientific > non_scientific t = 2.010115  p = 0.04452  percentile_50_sentence_length
9. scientific > non_scientific t = 2.010115  p = 0.04452  median_sentence_length
IGUAL LogicAndIncide

## unfavorable x mixed

In [11]:
unfav_m = []
unfav_mix_p = []
tstudent(unfavorable_reviews, mixed_reviews, unfav_m, unfav_mix_p)

1. scientific < non_scientific t = -6.231560  p = 8.977e-10  Unnamed: 0
IGUAL ConnectiveAdditiveIncidence p=0.496898
IGUAL sentence_sized_30 p=0.294586
IGUAL redability p=0.858921
IGUAL mean_sentence_length p=0.872871
IGUAL percentile_75_sentence_length p=0.797118
2. scientific < non_scientific t = -2.029472  p = 0.04286  percentile_75_word_length
3. scientific > non_scientific t = 2.114147  p = 0.03492  word_count
IGUAL ConnectiveLogicIncidence p=0.593622
IGUAL percentile_90_word_length p=0.068819
IGUAL mean_word_length p=0.255000
4. scientific > non_scientific t = 2.059502  p = 0.03988  syllable_count
IGUAL avg_word_per_sentence p=0.685544
IGUAL LogicIfIncidence p=nan
IGUAL percentile_50_sentence_length p=0.769062
IGUAL median_sentence_length p=0.769062
IGUAL LogicAndIncidence p=0.618282
IGUAL percentile_90_sentence_length p=0.331119
IGUAL median_word_length p=0.769062
IGUAL pronIncidence p=0.318726
5. scientific > non_scientific t = 2.894074  p = 0.003951  LogicOperatorsIncidence
IG

In [14]:
df_columns = mixed_reviews.columns.drop('thumbstotal')
corr_df = pd.DataFrame(data={'fav_m': np.array(fav_m)}, index=df_columns)
corr_df['mix_m'] = mix_m
corr_df['unfav_m'] = unfav_m
corr_df['fav_unfav_p'] = fav_unfav_p
corr_df['fav_mix_p'] = fav_mix_p
corr_df['unfav_mix_p'] = unfav_mix_p
corr_df.sort_values('fav_unfav_p',ascending=False)

Unnamed: 0,fav_m,mix_m,unfav_m,fav_unfav_p,fav_mix_p,unfav_mix_p
LogicIfIncidence,0.0,0.0,0.0,,,
LogicOperatorsIncidence,4.500335,5.225486,6.761931,9.986e-05,0.01572,0.003951
Unnamed: 0,15393.630852,15231.841951,12350.230612,9.948e-10,0.5075,8.977e-10
advIncidence,76.059182,80.819465,84.741067,9.704e-05,0.0004666,0.04419
LogicNegationIncidence,4.473877,5.208345,6.752654,8.763e-05,0.01415,0.003764
functionalIncidence,311.297489,313.066466,327.296345,7.654e-05,0.4689,5.282e-05
contentIncidence,599.198876,596.12134,582.545003,5.79e-07,0.1339,2.313e-06
ContentDiversty,0.834982,0.80921,0.778079,5.425e-16,6.784e-14,1.01e-06
sentence_count,3.779112,5.094203,5.722449,3.918e-19,1.479e-39,0.002233
syllable_count,122.363145,173.000783,186.836735,3.62e-19,4.427999999999999e-47,0.03988
