In [1]:
# First import all packages
################################################################################

# For reading and wrangling data
import pandas as pd # for reading data frames
import regex as re  # for data cleaning
import nltk         # for sentiment

# For sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
nltk.download("vader_lexicon")

# For regressions
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/professorf/nltk_data...


In [6]:
# Test: Sentiment package and word counts
################################################################################
sia = SentimentIntensityAnalyzer()

# Example sentiment scores for positive text
text = "I love this movie! So awesome. Best ever."
score = sia.polarity_scores(text)
print(text)
print(f'Positive Sentiment Score: {score['pos']}')
print(f'Negative Sentiment Score: {score['neg']}')
print(f'Neutral Sentiment Score: {score['neg']}') 
print(f'Compound Sentiment Score: {score['compound']}') 

# Example sentiment scores for negative text
text = "I absolutely hate this movie!"
score = sia.polarity_scores(text)
print(f'\n{text}')
print(f'Positive Sentiment Score: {score['pos']}') 
print(f'Negative Sentiment Score: {score['neg']}') 
print(f'Neutral Sentiment Score: {score['neg']}') 
print(f'Compound Sentiment Score: {score['compound']}') 

# Example sentiment scores for neutral text
text = "This    movie was both    good and bad."
score = sia.polarity_scores(text)
print(f'\n{text}')
print(f'Positive Sentiment Score: {score['pos']}') 
print(f'Negative Sentiment Score: {score['neg']}') 
print(f'Neutral Sentiment Score: {score['neg']}') 
print(f'Compound Sentiment Score: {score['compound']}') 

# Example how to remove extraneous spaces and count words
clean_text = re.sub(r'\s{2,}', ' ', text) # multiple space to one space
print(f'\nOriginal: {text}\nClean: {clean_text}')
word_count = len(clean_text.split(' '))
print(f'Post length: {word_count}')


I love this movie! So awesome. Best ever.
Positive Sentiment Score: 0.77
Negative Sentiment Score: 0.0
Neutral Sentiment Score: 0.0
Compound Sentiment Score: 0.9367

I absolutely hate this movie!
Positive Sentiment Score: 0.0
Negative Sentiment Score: 0.588
Neutral Sentiment Score: 0.588
Compound Sentiment Score: -0.6468

This    movie was both    good and bad.
Positive Sentiment Score: 0.254
Negative Sentiment Score: 0.307
Neutral Sentiment Score: 0.307
Compound Sentiment Score: -0.1531

Original: This    movie was both    good and bad.
Clean: This movie was both good and bad.
Post length: 7


In [8]:
# Read in data set
################################################################################
filename = 'IAM42.csv'
foldpath = '.' # . is folder of notebook. Note: no trailing slash
filepath = f'{foldpath}/{filename}' 
print(filepath)
df = pd.read_csv(filepath)

df.columns

./IAM42.csv


Index(['user', 'text'], dtype='object')

In [10]:
# Process for the independent variable (sentiment)
################################################################################
post_sentiments = [sia.polarity_scores(post)['compound'] for post in df.text]

word_counts     = [len((re.sub(r'/s{2,}', ' ', post)).split(' ')) for post in df.text]

print(post_sentiments, word_counts)

[0.9804, 0.9912, 0.9857, 0.9899, 0.9622, 0.969, 0.9753, 0.9702, 0.8902, 0.706, -0.4288, 0.0, 0.3612, 0.9396, 0.9849, 0.9688, 0.9897, 0.9663, 0.75, 0.5927, 0.9849, -0.3822, -0.4088, 0.9719, 0.9879, 0.0, 0.3816, 0.8704, 0.7783, 0.8429, 0.9623, 0.9084, -0.4676, 0.9897, 0.7351, 0.956, 0.6369, 0.9801, 0.775, 0.7783, 0.3488, 0.9735] [303, 425, 259, 347, 124, 233, 352, 368, 220, 357, 180, 85, 38, 324, 305, 152, 172, 198, 91, 148, 322, 147, 140, 89, 739, 63, 89, 90, 124, 71, 375, 262, 390, 378, 80, 269, 91, 211, 124, 220, 178, 248]


In [11]:
# Do regression analysis
################################################################################

(post_sentiments_train, post_sentiments_test, 
 word_counts_train, word_counts_test) = train_test_split(
     post_sentiments, 
     word_counts, 
     test_size = .2, 
     random_state = 0)

model = LinearRegression()

# Do this if you are doing traditional ML modeling
X = np.array(post_sentiments_train).reshape(-1,1) # if single var regression
Y = np.array(word_counts_train)

# For research modeling, use the entire data set
X = np.array(post_sentiments).reshape(-1,1)
Y = np.array(word_counts).reshape(-1,1)
model.fit(X, Y)

print(model.intercept_)
print(model.coef_)
print(model.score(X, Y))

[160.31460757]
[[90.83969967]]
0.08781152479494325


In [9]:
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
est2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.088
Model:,OLS,Adj. R-squared:,0.065
Method:,Least Squares,F-statistic:,3.851
Date:,"Thu, 01 Aug 2024",Prob (F-statistic):,0.0567
Time:,05:23:57,Log-Likelihood:,-263.61
No. Observations:,42,AIC:,531.2
Df Residuals:,40,BIC:,534.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,160.3146,38.030,4.215,0.000,83.453,237.176
x1,90.8397,46.293,1.962,0.057,-2.721,184.401

0,1,2,3
Omnibus:,19.405,Durbin-Watson:,2.076
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.883
Skew:,1.359,Prob(JB):,5.35e-07
Kurtosis:,6.019,Cond. No.,3.53


_The result was found to be marginally significant at the .10 level (p = .057)._

In [12]:
# a 3 predictor example

post_neutrals  = [sia.polarity_scores(post)['neu']      for post in df.text]
post_negatives = [sia.polarity_scores(post)['neg']      for post in df.text]
post_positives = [sia.polarity_scores(post)['pos']      for post in df.text]
post_compounds = [sia.polarity_scores(post)['compound'] for post in df.text]
df_xy = pd.DataFrame({
    'pos' : post_positives,
    'neg' : post_negatives,
    'neu' : post_neutrals,
    'com' : post_compounds,
    'wc'  : word_counts
})

X = df_xy[['pos', 'neu', 'com']]
Y = df_xy[['wc']]

model = LinearRegression()
model.fit(X, Y)

print(model.intercept_)
print(model.coef_)
print(model.score(X, Y))

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
est2.summary()

[2864.60115932]
[[-3586.93372317 -2791.23992754   226.13539334]]
0.36657913034180833


0,1,2,3
Dep. Variable:,wc,R-squared:,0.367
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,7.331
Date:,"Mon, 05 Aug 2024",Prob (F-statistic):,0.00054
Time:,14:33:36,Log-Likelihood:,-255.95
No. Observations:,42,AIC:,519.9
Df Residuals:,38,BIC:,526.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2864.6012,705.333,4.061,0.000,1436.729,4292.473
pos,-3586.9337,880.482,-4.074,0.000,-5369.376,-1804.491
neu,-2791.2399,737.526,-3.785,0.001,-4284.283,-1298.197
com,226.1354,53.069,4.261,0.000,118.702,333.569

0,1,2,3
Omnibus:,23.87,Durbin-Watson:,1.756
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.325
Skew:,1.579,Prob(JB):,6.45e-10
Kurtosis:,6.77,Cond. No.,114.0


In [15]:
# measure the predictors (independent variables)
scholarly_tone  = [sia.polarity_scores(post)['neu']      for post in df.text]
praise          = [sia.polarity_scores(post)['pos']      for post in df.text]
overall_tone    = [sia.polarity_scores(post)['compound'] for post in df.text]

# measure the outcome
post_length     = [len((re.sub(r'/s{2,}', ' ', post)).split(' ')) for post in df.text]

# Create a dataframe with the predictors and outcome
df_xy = pd.DataFrame({
    'scholarly_tone' : scholarly_tone,
    'praise'         : praise,
    'overall_tone'   : overall_tone,
    'post_length'    : post_length
})

# Assign predictors to X and outcome to Y
X  = df_xy[['scholarly_tone', 'praise', 'overall_tone']]
Y  = df_xy[['post_length']]
X2 = sm.add_constant(X) # add an extra constant for the intercept

# Perform regression
est = sm.OLS(Y, X2)     # perform regression
est2 = est.fit()        # validate regression
est2.summary()          # summarize regression

0,1,2,3
Dep. Variable:,post_length,R-squared:,0.367
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,7.331
Date:,"Mon, 05 Aug 2024",Prob (F-statistic):,0.00054
Time:,14:37:41,Log-Likelihood:,-255.95
No. Observations:,42,AIC:,519.9
Df Residuals:,38,BIC:,526.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2864.6012,705.333,4.061,0.000,1436.729,4292.473
scholarly_tone,-2791.2399,737.526,-3.785,0.001,-4284.283,-1298.197
praise,-3586.9337,880.482,-4.074,0.000,-5369.376,-1804.491
overall_tone,226.1354,53.069,4.261,0.000,118.702,333.569

0,1,2,3
Omnibus:,23.87,Durbin-Watson:,1.756
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42.325
Skew:,1.579,Prob(JB):,6.45e-10
Kurtosis:,6.77,Cond. No.,114.0


In [14]:
pause = 0