In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip
import math

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

# %autosave 50

## Configuration

In [63]:
# Subsampling
NB_SAMPLES = 20000  # up to 200k, then change the input file


200

## Get user comments

In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, trunc=0):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1        
    if trunc > 0 and i > trunc: 
        break
  return pd.DataFrame.from_dict(df, orient='index')

In [4]:
data_path = '../../datasets/'
file_name = 'reviews_Movies_and_TV.json.gz'

In [5]:
# comments_df = getDF(data_path + file_name, 200000)
# comments_df.loc[0,'reviewText']
# print(comments_df.shape)
# comments_df.head(1)

In [6]:
# Save the records
# import pickle
# pickle_out = open(data_path + "amzn_200k.pickle","wb")
# pickle.dump(comments_df, pickle_out)
# pickle_out.close()

In [7]:
# Reads records
import pickle
pickle_in = open(data_path + "amzn_200k.pickle","rb")
comments_df = pickle.load(pickle_in)

## Split comments into separate sentences

In [8]:
small = comments_df.loc[:NB_SAMPLES, :]
len(small.index)

20001

In [9]:
from nltk.tokenize import sent_tokenize
small['sentence'] = small['reviewText'].map(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
small.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'sentence'],
      dtype='object')

In [11]:
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
small.shape

(20001, 4)

In [13]:
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(3)

(118281, 4)


Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...


## Sentence-level prep & cleaning

In [14]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [15]:
sentences['words'] = sentences['sentence'].apply(lambda s: split_n_lower(s))

In [16]:
print(sentences.shape)
sentences.head(3)

(118281, 5)


Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...,"[this, has, some, great, tips, as, always, and..."
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...,"[i, have, to, admit, that, i, am, a, fan, of, ..."


### Keep support-related sentences as they probably have impact on rating

In [102]:
# on_movies_filter = [not_about_support(word) for word in sentences['words']]
sentences_on_movie = sentences #[on_movies_filter]

print('Removing {} records'.format(sentences.shape[0]- sentences_on_movie.shape[0]))

Removing 0 records


In [103]:
sentences_on_movie.shape

(118281, 5)

### Base case: A reviews with objective and subjective sentences

In [104]:
# Kernel dies here at 50K samples
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])


In [105]:
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars[:3]

reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    4.0
A0047322388NOTO4N8SKD  0310274281    5.0
A00473363TJ8YSZ3YAGG9  0310263662    4.0
Name: overall, dtype: float64

In [106]:
all_reviews_comments = all_reviews_groups['words'].sum()
print(sentences_on_movie.iloc[0, 4])
print(all_reviews_comments.shape)
print(all_reviews_comments[0])
len(all_reviews_comments)

['this', 'has', 'some', 'great', 'tips', 'as', 'always', 'and', 'is', 'helping', 'me', 'to', 'complete', 'my', 'good', 'eats', 'collection', '.']
(19998,)
['item', 'delivered', 'on', 'time', 'and', 'well', 'packaged', ',', 'slip', 'case', 'a', 'bit', 'worn', '.', 'essential', 'diana', 'rigg', 'episodes', 'from', '1965', 'in', 'b', '&', 'w', '.', 'it', "'s", 'called', "'65", 'dvd', 'set', '2', '(', 'in', 'case', 'you', 'get', 'as', 'confused', 'as', 'i', 'do', 'with', 'the', 'chronology', 'nomenclature', ')', '.', 'volumes', '3', '&', '4', '.', 'vol', '3', ':', 'the', 'murder', 'market', ',', 'a', 'surfeit', 'of', 'h2o', ',', 'the', 'hour', 'that', 'never', 'was', '.', 'vol', '4', ':', 'dial', 'a', 'deadly', 'number', ',', 'man-eater', 'of', 'surrey', 'green', ',', 'two', "'s", 'a', 'crowd', ',', 'and', 'bonus', 'episode', 'too', 'many', 'christmas', 'trees', '.']


19998

### Remove objective sentences for case B using obj-subj model

In [107]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

(118281, 5)

In [108]:
sentences_on_movie.head(2)

Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is helping me to complete my Good Eats collection.,"[this, has, some, great, tips, as, always, and, is, helping, me, to, complete, my, good, eats, collection, .]"
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"


#### Vectorize along the word space of the obj-subj training set

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = pickle.load(open('Obj-Subj_tfidf.pkl', 'rb'))
len(tfidf.vocabulary_)

20893

In [110]:
mat = tfidf.transform(sentences_on_movie['sentence']).todense()

In [111]:
mat

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

#### Apply the obj-subj model

In [112]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
gb_model = pickle.load(pickle_in)

In [114]:
y_test = gb_model.predict(mat)
len(y_test)

118281

In [115]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [116]:
subjective_sentences.head(2)

Unnamed: 0,reviewerID,asin,overall,sentence,words
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is helping me to complete my Good Eats collection.,"[this, has, some, great, tips, as, always, and, is, helping, me, to, complete, my, good, eats, collection, .]"
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.,"[this, is, a, great, pastry, guide, .]"


#### Merge the sentences back into paragraph reviews

In [117]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
# subjective_reviewssubjective_reviews['sentence'].apply(lambda x: x.sum())
# subjective_reviews_reviews = 
subj_reviews_stars[:3]

reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    4.0
A00473363TJ8YSZ3YAGG9  0310263662    4.0
A017699216H6YAFBGYJOW  0740328271    5.0
Name: overall, dtype: float64

In [118]:
subj_review_comments = subj_groups['words'].sum()
print(subj_review_comments.shape)
print(subj_review_comments[0])
subj_review_comments[:3]

(19269,)
['item', 'delivered', 'on', 'time', 'and', 'well', 'packaged', ',', 'slip', 'case', 'a', 'bit', 'worn', '.', 'it', "'s", 'called', "'65", 'dvd', 'set', '2', '(', 'in', 'case', 'you', 'get', 'as', 'confused', 'as', 'i', 'do', 'with', 'the', 'chronology', 'nomenclature', ')', '.', 'volumes', '3', '&', '4', '.', 'vol', '3', ':', 'the', 'murder', 'market', ',', 'a', 'surfeit', 'of', 'h2o', ',', 'the', 'hour', 'that', 'never', 'was', '.', 'vol', '4', ':', 'dial', 'a', 'deadly', 'number', ',', 'man-eater', 'of', 'surrey', 'green', ',', 'two', "'s", 'a', 'crowd', ',', 'and', 'bonus', 'episode', 'too', 'many', 'christmas', 'trees', '.']


reviewerID             asin      
A00295401U6S2UG3RAQSZ  0767015533    [item, delivered, on, time, and, well, packaged, ,, slip, case, a, bit, worn, ., it, 's, called, '65, dvd, set, 2, (, in, case, you, get, as, confused, as, i, do, with, the, chronology, nomenclature, ), ., volumes, 3, &, 4, ., vol, 3, :, the, murder, market, ,, a, surfeit, of, h2o, ,, the, hour, that, never, was, ., vol, 4, :, dial, a, deadly, number, ,, man-eater, of, surrey, green, ,, two, 's, a, crowd, ,, and, bonus, episode, too, many, christmas, trees, .]
A00473363TJ8YSZ3YAGG9  0310263662    [good, movie, showing, the, &, #, 34, ;, passion, &, #, 34, ;, or, ending, side, of, christ, only, showing, basically, his, trial, and, crucifixion, .]                                                                                                                                                                                                                                                                                    

### Check that stars still correspond to the right movie

In [119]:
start = 6000
end = 6010
all_reviews_comments.loc[('A33Z7JTV7SSW9Y', '0718000315')]


['one',
 'of',
 'the',
 'first',
 'extended',
 'modern',
 'presentations',
 'of',
 'the',
 'life',
 'of',
 'jesus',
 'was',
 '&',
 'quot',
 ';',
 'jesus',
 'of',
 'nazareth',
 '&',
 'quot',
 ';',
 'starring',
 'robert',
 'powell',
 '.',
 'this',
 'one',
 'is',
 'the',
 'standard',
 'and',
 'measure',
 'of',
 'all',
 'other',
 'productions',
 'on',
 'the',
 'same',
 'subject',
 '.',
 'bruce',
 'marchiano',
 ',',
 'although',
 'very',
 'sincere',
 'in',
 'his',
 'efforts',
 'to',
 'portray',
 'jesus',
 ',',
 'just',
 'does',
 "n't",
 'have',
 'the',
 '&',
 'quot',
 ';',
 'gravitas',
 '&',
 'quot',
 ';',
 'of',
 'a',
 'robert',
 'powell.the',
 'one',
 'point',
 'in',
 'the',
 'visual',
 'bible',
 "'s",
 'favor',
 'is',
 'it',
 "'s",
 'dedication',
 'to',
 'following',
 'the',
 'gospel',
 'of',
 'st.',
 'matthew',
 'without',
 'any',
 'deviation',
 '.',
 '&',
 'quot',
 ';',
 'jesus',
 'of',
 'nazareth',
 '&',
 'quot',
 ';',
 'on',
 'the',
 'other',
 'hand',
 ',',
 'has',
 'the',
 'advantag

In [120]:
print(all_reviews_stars.loc[('A33Z7JTV7SSW9Y', '0718000315')])
print(sentences_on_movie.loc[sentences_on_movie['reviewerID']=='A33Z7JTV7SSW9Y']) 
# and sentences_on_movie['asin']=='0718000315'])

3.0
           reviewerID        asin  overall  \
6755   A33Z7JTV7SSW9Y  0718000315  3.0       
26756  A33Z7JTV7SSW9Y  0718000315  3.0       
46757  A33Z7JTV7SSW9Y  0718000315  3.0       
66758  A33Z7JTV7SSW9Y  0718000315  3.0       
86759  A33Z7JTV7SSW9Y  0718000315  3.0       

                                                                                                                                                                                                                                                               sentence  \
6755   One of the first extended modern presentations of the life of Jesus was &quot;Jesus of Nazareth&quot; starring Robert Powell.                                                                                                                                      
26756  This one is the standard and measure of all other productions on the same subject.                                                                                                

In [121]:
pd.options.display.max_colwidth = -1
print(small.loc[small['reviewerID']=='A33Z7JTV7SSW9Y'])

          reviewerID        asin  overall  \
6755  A33Z7JTV7SSW9Y  0718000315  3.0       

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              sentence  
6755  [One of the first extended modern presentations of the life of Jesus was &quot;Jesus of Nazareth&quot; starring Robert Powell., This one is th

In [122]:
sentences_on_movie[start:end]

Unnamed: 0,reviewerID,asin,overall,sentence,words
6001,A3ALE58N6QBFBW,061524226X,5.0,Wendy Stevens continues to Lead in our industry as she builds her empire.,"[wendy, stevens, continues, to, lead, in, our, industry, as, she, builds, her, empire, .]"
6002,A2O1W4NBZ27YHE,061524226X,5.0,"I have tried several internet marketing approaches, but Wendy Stevens' simple step-by-step approach has truly been what the doctor ordered!","[i, have, tried, several, internet, marketing, approaches, ,, but, wendy, stevens, ', simple, step-by-step, approach, has, truly, been, what, the, doctor, ordered, !]"
6003,AKKS42WDCR3W4,061524226X,5.0,We picked up our copy of Wendy's incredible DVD at her 6 Figure School in Ft. Lauderdale in early Sept.,"[we, picked, up, our, copy, of, wendy, 's, incredible, dvd, at, her, 6, figure, school, in, ft., lauderdale, in, early, sept, .]"
6005,A2G1752CKBBVGH,061524226X,5.0,"I had been held back from doing any internet marketing, especially with Google because it seemed so difficult and time consuming as well as just not really understanding how to achieve the results I wanted.","[i, had, been, held, back, from, doing, any, internet, marketing, ,, especially, with, google, because, it, seemed, so, difficult, and, time, consuming, as, well, as, just, not, really, understanding, how, to, achieve, the, results, i, wanted, .]"
6006,A22Q1R9RTP6ACF,061524226X,5.0,"Wendy Stevens guides you step by step in creative, easy to understand steps on how Google can work for you, therefore increasing your income!","[wendy, stevens, guides, you, step, by, step, in, creative, ,, easy, to, understand, steps, on, how, google, can, work, for, you, ,, therefore, increasing, your, income, !]"
6007,A39VKIWQNYPPAP,061524226X,5.0,"Have been through a few ""must have"" tutorials on marketing...alot of great information.","[have, been, through, a, few, ``, must, have, '', tutorials, on, marketing, ..., alot, of, great, information, .]"
6008,A1IUPJRSZDEB32,061524226X,5.0,A concise and easy to follow tool that puts a human face to internet marketing and sparks your creativity.,"[a, concise, and, easy, to, follow, tool, that, puts, a, human, face, to, internet, marketing, and, sparks, your, creativity, .]"
6009,A1Z5PAM8BAVY8U,061524226X,5.0,RGM 2010 is the leading edge tool box that will bring you to the fore front of WWW.,"[rgm, 2010, is, the, leading, edge, tool, box, that, will, bring, you, to, the, fore, front, of, www, .]"
6010,A28SNSQ0HHIN9L,061524226X,5.0,"I've struggled with advertising, used several tools to create campaigns but none as easy to use and as effective as this directive.","[i, 've, struggled, with, advertising, ,, used, several, tools, to, create, campaigns, but, none, as, easy, to, use, and, as, effective, as, this, directive, .]"
6011,A3CGDZYN2ZMH58,061524226X,5.0,"Wendy, thank you so much!","[wendy, ,, thank, you, so, much, !]"


## Create emotion vectors

In [123]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

Total number of reviews: 19998
Total number of subjective reviews: 19269


In [124]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [125]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
print(all_reviews_emotions.shape)

(19998, 7)


In [126]:
# emote.emotions_in_text

In [127]:
all_reviews_emotions.shape

(19998, 7)

In [128]:
# all_revs_with_emotions = all_reviews_emotions[emote.emotions_in_text == True]

In [129]:
# print(all_revs_with_emotions.shape)
# all_revs_stars = all_reviews_stars[emote.emotions_in_text]
all_reviews_emotions[0]

array([0.22124355, 0.55099301, 0.23802878, 0.43633147, 0.35029095,
       0.4378464 , 0.29326086])

In [130]:
subj_reviews_emotions = emote.vectorize(subj_review_comments)
print(subj_reviews_emotions.shape)
subj_reviews_emotions[0]

(19269, 7)


array([0.21691801, 0.52631264, 0.24600442, 0.46722458, 0.33064997,
       0.46700106, 0.26401745])

## Fit a model on base case (all comments) for star rating prediction

In [131]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    all_reviews_emotions, all_reviews_stars, test_size=0.2, random_state=0)

In [132]:
X_train.shape

(15998, 7)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier

# loss: deviance: logistic log likelihood

In [135]:
# Gradient Boosting Classifier parameters
N_TREES = 900 #math.floor(np.sqrt(NB_SAMPLES))
LEARN_RATE = 0.01
MIN_IN_LEAF = 3 #7
N_TREES

900

In [136]:
gbc_all = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_all.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=900,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [141]:
gbr_all = GradientBoostingRegressor(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbr_all.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=3,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=900, n_iter_no_change=None, presort='auto',
             random_state=0, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
X_subj_train, X_subj_cv, y_subj_train, y_subj_cv = train_test_split(
    subj_reviews_emotions, subj_reviews_stars, test_size=0.2, random_state=0)
X_subj_train.shape

In [None]:
gbc_subj = GradientBoostingClassifier(learning_rate=LEARN_RATE, 
                                n_estimators=N_TREES, 
                                min_samples_leaf=MIN_IN_LEAF,
                                random_state=0)
gbc_subj.fit(X_subj_train, y_subj_train)

In [139]:
print('Gradient Boosting Classifier')
print('Training score using all comments: {0:.2f}'
      .format(gbc_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbc_all.score(X_cv, y_cv)))
print('')

# print('Training score using subjective comments only: {0:.2f}'
#       .format(gbc_subj.score(X_subj_train, y_subj_train)))
# print('CV score using subjective comments only: {0:.2f}'
#       .format(gbc_subj.score(X_subj_cv, y_subj_cv)))


Gradient Boosting Classifier
Training score using all comments: 0.68
CV score using all comments: 0.66



In [142]:
print('Gradient Boosting Regressor')
print('Training score using all comments: {0:.2f}'
      .format(gbr_all.score(X_train, y_train)))
print('CV score using all comments: {0:.2f}'
      .format(gbr_all.score(X_cv, y_cv)))
print('')

Gradient Boosting Regressor
Training score using all comments: 0.12
CV score using all comments: 0.08



In [None]:
np.unique(gbc_subj.predict(X_subj_train))

## Other techniques

In [None]:
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression(random_state=0, solver='lbfgs',
#                        multi_class='multinomial',max_iter=1000)
# lr.fit(X_subj_train, y_subj_train)
# print(lr.score(X_subj_train, y_subj_train))
# print(lr.score(X_subj_cv, y_subj_cv))

In [133]:
import statsmodels.api as sm
ols_all = sm.OLS(y_train, X_train)
results_all = ols_all.fit()
results_all.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.938
Model:,OLS,Adj. R-squared:,0.938
Method:,Least Squares,F-statistic:,34850.0
Date:,"Tue, 02 Jul 2019",Prob (F-statistic):,0.0
Time:,09:15:27,Log-Likelihood:,-24449.0
No. Observations:,15998,AIC:,48910.0
Df Residuals:,15991,BIC:,48970.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.4484,0.131,-3.435,0.001,-0.704,-0.193
x2,2.5880,0.070,37.052,0.000,2.451,2.725
x3,2.7924,0.120,23.196,0.000,2.556,3.028
x4,0.4966,0.080,6.218,0.000,0.340,0.653
x5,0.4493,0.092,4.874,0.000,0.269,0.630
x6,3.5053,0.050,70.616,0.000,3.408,3.603
x7,1.4830,0.081,18.228,0.000,1.324,1.642

0,1,2,3
Omnibus:,4150.914,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8826.222
Skew:,-1.532,Prob(JB):,0.0
Kurtosis:,4.963,Cond. No.,15.1


In [134]:
ols_subj = sm.OLS(y_subj_train, X_subj_train)
results_subj = ols_subj.fit()
results_subj.summary()

0,1,2,3
Dep. Variable:,overall,R-squared:,0.929
Model:,OLS,Adj. R-squared:,0.929
Method:,Least Squares,F-statistic:,28230.0
Date:,"Tue, 02 Jul 2019",Prob (F-statistic):,0.0
Time:,09:15:27,Log-Likelihood:,-23978.0
No. Observations:,15037,AIC:,47970.0
Df Residuals:,15030,BIC:,48020.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.4623,0.122,-3.791,0.000,-0.701,-0.223
x2,2.4421,0.064,37.992,0.000,2.316,2.568
x3,3.5840,0.080,44.755,0.000,3.427,3.741
x4,0.7920,0.077,10.339,0.000,0.642,0.942
x5,0.4521,0.086,5.243,0.000,0.283,0.621
x6,3.1508,0.047,67.357,0.000,3.059,3.243
x7,1.5227,0.076,19.965,0.000,1.373,1.672

0,1,2,3
Omnibus:,2820.497,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5973.457
Skew:,-1.104,Prob(JB):,0.0
Kurtosis:,5.158,Cond. No.,12.3


In [None]:
import seaborn as sns

all_reviews_emotions, all_reviews_stars

sns.heatmap(raw_df.corr(), annot=True)