<a href="https://colab.research.google.com/github/lisa11323/VR_F/blob/main/VR_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Module

In [None]:
!pip install gensim

In [None]:
!pip install pyLDAvis

In [None]:
!pip install nltk==3.8.1 --upgrade

In [None]:
!pip install vaderSentiment

In [None]:
import re
import numpy as np
import nltk
from nltk import PorterStemmer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer

punkt_tokenizer = PunktSentenceTokenizer()
lemmatizer = WordNetLemmatizer()
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')


import gensim
from gensim import corpora
from gensim.models import Phrases, CoherenceModel
from gensim.models.phrases import Phraser
from gensim.models.ldamodel import LdaModel

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


#Text Preprocessing

In [None]:
# Upload dataset

from google.colab import files

uploaded = files.upload()

print("Uploaded file(s):", uploaded.keys())

Saving vr_data.xlsx to vr_data.xlsx
Uploaded file(s): dict_keys(['vr_data.xlsx'])


In [None]:
import pandas as pd

data = pd.read_excel('vr_data.xlsx')
print(data.columns)

Index(['Brand', 'Link', 'contents', 'star', 'helpfulness', 'purchase', 'name',
       'date', 'helpful logit'],
      dtype='object')


In [None]:
original_len = len(data)
print(f"Rows before preprocessing: {original_len}")

Rows before preprocessing: 5656


In [None]:
def preprocess_text(text):
    if pd.isna(text):
        return None
    text = re.sub(' +', ' ', str(text))
    text = re.sub(r'\.+|\!+|\?+', '.', text)
    sentences = punkt_tokenizer.tokenize(text.lower())

    cells = []
    for cell in sentences:
        cell = re.sub(r'\d+', '', cell)
        cell = re.sub(r"\'ve", " have", cell)
        cell = re.sub(r"n\'t", " not", cell)
        cell = re.sub(r"\'ll", " will", cell)
        cell = re.sub(r"dont", "do not", cell)
        cell = re.sub(r"wont", "would not", cell)
        cell = re.sub(r"alot", "a lot", cell)
        cells.append(cell)

    return ' '.join(cells)

data['preprocessed'] = data['contents'].astype(str).apply(preprocess_text)

In [None]:
data['tokens'] = data['preprocessed'].apply(lambda x: nltk.word_tokenize(x) if pd.notnull(x) else [])
data['tokens'] = data['tokens'].apply(lambda x: [w for w in x if len(w) > 2])


# POS tagging
data['tagged'] = data['tokens'].apply(lambda x: pos_tag(x))

# Remove unnecessary POS tags
exclude_pos = {'DT', 'IN', 'CC', 'TO', 'MD', 'PRP', 'PRP$', 'UH'}
data['filtered'] = data['tagged'].apply(lambda x: [(w, p) for w, p in x if p not in exclude_pos])

# Lemmatization
nn = ['NN', 'NNS', 'NNP', 'NNPS']
vb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
lemmatizer = WordNetLemmatizer()

data['lemmed'] = data['filtered'].apply(
    lambda x: [(lemmatizer.lemmatize(w, pos='n'), p) if p in nn else (w, p) for w, p in x]
)
data['lemmed'] = data['lemmed'].apply(
    lambda x: [(lemmatizer.lemmatize(w, pos='v'), p) if p in vb else (w, p) for w, p in x]
)

stop_en = stopwords.words('english')
stop_es = stopwords.words('spanish')
stop_fr = stopwords.words('french')
stop_de = stopwords.words('german')
stop_multi = set(stopwords.words('english') + stopwords.words('spanish') + stopwords.words('french') + stopwords.words('german'))
custom_stops = ["vr", "excellent", "love", "like", "awesome", "good", "great", "best", "perfect", "way", "use", "very", "nice",
                "wish", "super", "need", "want", "thanks", "thank", "lot", "let", "make", "people", "help", 'ist',
                'really', 'recommend', 'better', 'fun', 'cool', 'pretty', 'enjoy', 'happy', 'highly', 'definitely', 'awesome',
                'try', 'thing', 'new', 'say', 'look', 'time', 'day', 'week', 'hour','para', 'muy', 'una', 'los', 'la', 'por',
                'pero', 'del', 'como', 'las', 'más', 'poco', 'sin', 'tiene', 'todo', 'solo', 'uso', 'mucho',
                'ich', 'die', 'der', 'que', 'son', 'buy', 'est', 'pour', 'vous', 'pa', 'une', 'les', 'sur', 'mais',
                 'man', 'nicht', 'da', 'für', 'und', 'auch', 'aber', 'mit', 'das', 'den', 'auf', 'kann', 'ein', 'eine',
                'habe', 'von', 'sich', 'wie', 'bei', 'wird', 'hat', 'sie', 'mir', 'war', 'oder', 'nach', 'noch',
                'one', 'well', 'even', 'go', 'get', 'use', 'also', 'thing', 'really', 'see']
stop_multi.update(custom_stops)
remove_pos = ['MD', 'IN', 'CC', 'DT', 'EX']

data['processed_text'] = data['lemmed'].apply(lambda x: [w for w, p in x if w not in stop_multi and p not in remove_pos])

In [None]:
print(f"Total number of rows: {len(data)}")
print(f"Empty token lists: {data['processed_text'].apply(lambda x: len(x) == 0).sum()}")
print(f"Valid token lists: {data['processed_text'].apply(lambda x: len(x) > 0).sum()}")
print(f"Duplicated indices: {data.index.duplicated().sum()}개")

Total number of rows: 5656
Empty token lists: 251
Valid token lists: 5405
Duplicated indices: 0개


In [None]:
data = data[data['processed_text'].apply(lambda x: len(x) > 0)]

In [None]:
len(data)

5405

#LDA Topic Modeling

In [None]:
tokenized_doc = data['processed_text'].dropna().tolist()
tokenized_doc = [doc for doc in tokenized_doc if len(doc) > 0]

In [None]:
dictionary = corpora.Dictionary(tokenized_doc)
bow_corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [None]:
k = 10

lda_model = LdaModel(corpus=bow_corpus,
                     id2word=dictionary,
                     num_topics=k,
                     random_state=42,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Perplexity
perplexity = lda_model.log_perplexity(bow_corpus)

# Coherence
coherence_model = CoherenceModel(model=lda_model,
                                 texts=tokenized_doc,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

k_values = np.array([[perplexity, coherence]])
k_ran = [k]

print(f"Topic Num: {k}")
print(f"Perplexity: {perplexity}")
print(f"Coherence: {coherence}")

Topic Num: 10
Perplexity: -8.713052825603796
Coherence: 0.3988469157986868


In [None]:
topicn = 10
topn = 30
seed = 42

detokenized_doc = [' '.join(doc) for doc in tokenized_doc]

vectorizer = CountVectorizer(stop_words=list(stop_multi), max_df=0.3, max_features=1500, min_df=3)
X = vectorizer.fit_transform(detokenized_doc)

# Train LDA model
lda_model = LatentDirichletAllocation(n_components=topicn, learning_method='batch', random_state=seed, max_iter=20, doc_topic_prior=0.4, topic_word_prior=0.2)
lda_model.fit(X)

probs = lda_model.transform(X)
terms = vectorizer.get_feature_names_out()
beta = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

def get_topics(components, feature_names, n):
    for i, topic in enumerate(components):
        print(f"Topic {i+1}:", [(feature_names[j], round(topic[j], 2)) for j in topic.argsort()[:-n - 1:-1]])

print(lda_model.components_.shape)
get_topics(lda_model.components_, terms, topn)

(10, 1500)
Topic 1: [('game', 1010.81), ('play', 504.68), ('headset', 397.91), ('experience', 346.92), ('screen', 295.51), ('psvr', 271.02), ('feel', 266.49), ('motion', 255.36), ('resolution', 217.6), ('watch', 200.96), ('take', 182.87), ('effect', 175.28), ('first', 174.84), ('sony', 161.19), ('movie', 158.98), ('move', 151.68), ('sickness', 143.18), ('graphic', 134.4), ('immersive', 127.46), ('still', 113.92), ('much', 112.74), ('head', 112.56), ('set', 112.14), ('high', 106.08), ('many', 105.18), ('world', 101.63), ('demo', 101.3), ('price', 101.16), ('find', 98.03), ('quite', 97.52)]
Topic 2: [('work', 1157.37), ('issue', 579.83), ('headset', 350.15), ('problem', 257.66), ('star', 234.8), ('give', 173.63), ('update', 168.64), ('support', 166.2), ('fix', 161.14), ('software', 128.26), ('product', 124.74), ('fine', 122.15), ('never', 119.54), ('everything', 115.1), ('return', 113.73), ('know', 112.86), ('cable', 112.66), ('still', 110.19), ('find', 110.04), ('box', 107.23), ('vive',

In [None]:
import pyLDAvis
from pyLDAvis import prepare
# Document-topic distribution
doc_topic_dists = lda_model.transform(X)
# Number of words per document
doc_lengths = np.array(X.sum(axis=1)).flatten()

terms = vectorizer.get_feature_names_out()
term_frequency = np.array(X.sum(axis=0)).flatten()

topic_term_dists = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

# Create pyLDAvis visualization object
vis = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=terms,
    term_frequency=term_frequency
)

pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

# Variable Construction

In [None]:
topicn = 10

topic_columns = [f"topic{i+1}" for i in range(topicn)]

# Create dataframe of document-topic distributions
topic_df = pd.DataFrame(doc_topic_dists, columns=topic_columns)

data = data.reset_index(drop=True)
topic_df = topic_df.reset_index(drop=True)

data = pd.concat([data, topic_df], axis=1)

print(data.columns)
print(data.head())

Index(['Brand', 'Link', 'contents', 'star', 'helpfulness', 'purchase', 'name',
       'date', 'helpful logit', 'preprocessed', 'tokens', 'tagged', 'filtered',
       'lemmed', 'processed_text', 'topic1', 'topic2', 'topic3', 'topic4',
       'topic5', 'topic6', 'topic7', 'topic8', 'topic9', 'topic10'],
      dtype='object')
                        Brand  \
0  Topmaxion Google Cardboard   
1  Topmaxion Google Cardboard   
2  Topmaxion Google Cardboard   
3  Topmaxion Google Cardboard   
4  Topmaxion Google Cardboard   

                                                Link  \
0  https://www.amazon.com/Cardboard-Topmaxions-Vi...   
1  https://www.amazon.com/Cardboard-Topmaxions-Vi...   
2  https://www.amazon.com/Cardboard-Topmaxions-Vi...   
3  https://www.amazon.com/Cardboard-Topmaxions-Vi...   
4  https://www.amazon.com/Cardboard-Topmaxions-Vi...   

                                            contents                star  \
0  bought for Xmas present for a friend. love it ...  5.0 out o

In [None]:
data.isna().sum()

Unnamed: 0,0
Brand,0
Link,0
contents,2
star,0
helpfulness,3086
purchase,665
name,2
date,0
helpful logit,0
preprocessed,0


#Descriptive Statistics

In [None]:
cols = [f'topic{i}' for i in range(1, 11)] + ['helpful logit']

desc_stats = data[cols].describe(percentiles=[0.5]).T

desc_stats = desc_stats.rename(columns={
    'mean': 'Mean',
    'std': 'StdDev',
    'min': 'Min',
    '50%': 'Median',
    'max': 'Max'
})[['Mean', 'StdDev', 'Min', 'Median', 'Max']]

desc_stats = desc_stats.round(3)

print(desc_stats)

                Mean  StdDev    Min  Median    Max
topic1         0.110   0.136  0.001   0.058  0.867
topic2         0.101   0.113  0.001   0.059  0.784
topic3         0.064   0.087  0.001   0.044  0.918
topic4         0.135   0.148  0.002   0.070  0.879
topic5         0.094   0.128  0.001   0.052  0.932
topic6         0.088   0.124  0.001   0.044  0.950
topic7         0.087   0.118  0.001   0.050  0.880
topic8         0.093   0.116  0.002   0.054  0.876
topic9         0.096   0.123  0.001   0.055  0.858
topic10        0.132   0.163  0.001   0.067  0.860
helpful logit  0.429   0.495  0.000   0.000  1.000


In [None]:
from scipy.stats import pearsonr

cols = [f'topic{i}' for i in range(1, 11)] + ['helpful logit']

corr_matrix = pd.DataFrame(index=cols, columns=cols)
p_matrix = pd.DataFrame(index=cols, columns=cols)

# Correlation and p-value calculation
for i in cols:
    for j in cols:
        if i == j:
            corr_matrix.loc[i, j] = 1.0
            p_matrix.loc[i, j] = 0.0
        else:
            x = data[i]
            y = data[j]
            corr, pval = pearsonr(x, y)
            corr_matrix.loc[i, j] = round(corr, 3)
            p_matrix.loc[i, j] = pval

# Significance annotation function
def significance(p):
    if p < 0.001:
        return '***'
    elif p < 0.01:
        return '**'
    elif p < 0.05:
        return '*'
    else:
        return ''

annotated_corr = corr_matrix.copy()
for i in cols:
    for j in cols:
        star = significance(p_matrix.loc[i, j])
        corr_val = corr_matrix.loc[i, j]
        annotated_corr.loc[i, j] = f'{corr_val}{star}'

print(annotated_corr)

                  topic1     topic2     topic3     topic4     topic5  \
topic1            1.0***  -0.177***  -0.118***     -0.018  -0.208***   
topic2         -0.177***     1.0***  -0.055***  -0.194***   0.123***   
topic3         -0.118***  -0.055***     1.0***  -0.132***  -0.057***   
topic4            -0.018  -0.194***  -0.132***     1.0***  -0.182***   
topic5         -0.208***   0.123***  -0.057***  -0.182***     1.0***   
topic6         -0.146***  -0.118***     -0.007  -0.146***  -0.094***   
topic7         -0.132***   -0.07***      -0.02  -0.156***  -0.115***   
topic8         -0.084***  -0.065***   -0.09***  -0.057***  -0.126***   
topic9         -0.088***  -0.124***  -0.107***  -0.086***  -0.111***   
topic10        -0.138***  -0.156***  -0.066***  -0.219***  -0.174***   
helpful logit   0.055***   0.048***  -0.057***      0.006   0.134***   

                  topic6     topic7     topic8     topic9    topic10  \
topic1         -0.146***  -0.132***  -0.084***  -0.088***  -0.1

#Logistic Regression

In [None]:
import statsmodels.api as sm

X_vars = [f'topic{i}' for i in range(1, 10)] # set topic 10 as reference category


X = data[X_vars]
y = data['helpful logit']

X = sm.add_constant(X)

# Fit logistic regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.650263
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          helpful logit   No. Observations:                 5405
Model:                          Logit   Df Residuals:                     5395
Method:                           MLE   Df Model:                            9
Date:                Tue, 22 Jul 2025   Pseudo R-squ.:                 0.04799
Time:                        04:10:25   Log-Likelihood:                -3514.7
converged:                       True   LL-Null:                       -3691.9
Covariance Type:            nonrobust   LLR p-value:                 7.289e-71
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5541      0.164     -9.469      0.000      -1.876      -1.232
topic1         2.0887      0.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF
X_no_const = X.drop(columns='const')

vif_df = pd.DataFrame({
    'Variable': X_no_const.columns,
    'VIF': [variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])]
})

vif_df['VIF'] = vif_df['VIF'].round(3)

print(vif_df)

  Variable    VIF
0   topic1  1.357
1   topic2  1.553
2   topic3  1.360
3   topic4  1.443
4   topic5  1.374
5   topic6  1.284
6   topic7  1.305
7   topic8  1.399
8   topic9  1.344


In [None]:
print(f"AIC: {result.aic:.3f}")
print(f"BIC: {result.bic:.3f}")

AIC: 7049.344
BIC: 7115.295


In [None]:
import numpy as np
import pandas as pd

summary_table = pd.DataFrame({
    'Variable': result.params.index,
    'Coef': result.params.values,
    'StdErr': result.bse,
    'P-value': result.pvalues,
})

summary_table['Odds Ratio'] = np.exp(summary_table['Coef'])
summary_table['Signif'] = summary_table['P-value'].apply(
    lambda p: '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
)

summary_table['Coef'] = summary_table['Coef'].round(3)
summary_table['Odds Ratio'] = summary_table['Odds Ratio'].round(3)
summary_table['P-value'] = summary_table['P-value'].round(3)

summary_table = summary_table[['Variable', 'Coef', 'Odds Ratio', 'P-value', 'Signif']]
print(summary_table)

       Variable   Coef  Odds Ratio  P-value Signif
const     const -1.554       0.211    0.000    ***
topic1   topic1  2.089       8.074    0.000    ***
topic2   topic2  1.617       5.039    0.000    ***
topic3   topic3  0.237       1.268    0.532       
topic4   topic4  1.230       3.423    0.000    ***
topic5   topic5  3.246      25.678    0.000    ***
topic6   topic6 -1.453       0.234    0.000    ***
topic7   topic7  1.867       6.467    0.000    ***
topic8   topic8  0.922       2.514    0.001     **
topic9   topic9  2.551      12.822    0.000    ***


# Word-level Logistic Regression

In [None]:
# Extract top 3 words for each topic
top_words = []
vocab = vectorizer.get_feature_names_out()
word_to_topics = {}

for topic_idx, topic in enumerate(lda_model.components_, start=1):
    top_indices = topic.argsort()[::-1][:3]
    for i in top_indices:
        word = vocab[i]
        top_words.append(word)
        if word in word_to_topics:
            word_to_topics[word].append(topic_idx)
        else:
            word_to_topics[word] = [topic_idx]

for word, topics in word_to_topics.items():
    print(f"{word} → topic(s): {topics}")

game → topic(s): [1, 4, 8]
play → topic(s): [1, 4]
headset → topic(s): [1, 2, 9, 10]
work → topic(s): [2]
issue → topic(s): [2]
controller → topic(s): [3]
button → topic(s): [3]
brille → topic(s): [3]
experience → topic(s): [4]
htc → topic(s): [5]
month → topic(s): [5]
base → topic(s): [5]
product → topic(s): [6]
price → topic(s): [6]
christmas → topic(s): [6]
phone → topic(s): [7]
samsung → topic(s): [7]
note → topic(s): [7]
vive → topic(s): [8, 9]
set → topic(s): [8]
rift → topic(s): [9]
quality → topic(s): [10]
lens → topic(s): [10]


In [None]:
# Remove duplicates
unique_top_words = list(dict.fromkeys(top_words))  # 순서 유지하면서 중복 제거
selected_words = unique_top_words[:30]

In [None]:
# Create binary indicators for word presence
for word in selected_words:
    data[f'has_{word}'] = data['processed_text'].apply(lambda x: 1 if word in x else 0)

In [None]:
binary_cols = [col for col in data.columns if col.startswith("has_")]
binary_cols.append('helpful logit')

binary_data = data[binary_cols]

summary_stats = binary_data.agg(['mean', 'std', 'min', 'max']).T
summary_stats.columns = ['Mean', 'Std', 'Min', 'Max']
summary_stats = summary_stats.round(3)

print("\n=== Fundamental Stats (Mean, Std, Min, Max) ===")
print(summary_stats)


=== Fundamental Stats (Mean, Std, Min, Max) ===
                 Mean    Std  Min  Max
has_game        0.214  0.410  0.0  1.0
has_play        0.147  0.354  0.0  1.0
has_headset     0.175  0.380  0.0  1.0
has_work        0.215  0.411  0.0  1.0
has_issue       0.083  0.276  0.0  1.0
has_controller  0.095  0.293  0.0  1.0
has_button      0.026  0.161  0.0  1.0
has_brille      0.008  0.088  0.0  1.0
has_experience  0.159  0.366  0.0  1.0
has_htc         0.092  0.289  0.0  1.0
has_month       0.049  0.216  0.0  1.0
has_base        0.042  0.201  0.0  1.0
has_product     0.141  0.348  0.0  1.0
has_price       0.100  0.300  0.0  1.0
has_christmas   0.021  0.144  0.0  1.0
has_phone       0.087  0.282  0.0  1.0
has_samsung     0.042  0.200  0.0  1.0
has_note        0.031  0.174  0.0  1.0
has_vive        0.148  0.355  0.0  1.0
has_set         0.089  0.285  0.0  1.0
has_rift        0.043  0.204  0.0  1.0
has_quality     0.083  0.275  0.0  1.0
has_lens        0.047  0.212  0.0  1.0
helpful logit  

In [None]:
from scipy.stats import pearsonr
import pandas as pd

# Correlation and p-value calculation
corr_matrix = pd.DataFrame(index=binary_cols, columns=binary_cols)
pval_matrix = pd.DataFrame(index=binary_cols, columns=binary_cols)

for col1 in binary_cols:
    for col2 in binary_cols:
        if col1 == col2:
            corr_matrix.loc[col1, col2] = 1.0
            pval_matrix.loc[col1, col2] = 0.0
        else:
            r, p = pearsonr(binary_data[col1], binary_data[col2])
            corr_matrix.loc[col1, col2] = r
            pval_matrix.loc[col1, col2] = p

# Significance annotation function
def get_stars(p):
    if p < 0.001:
        return '***'
    elif p < 0.01:
        return '**'
    elif p < 0.05:
        return '*'
    else:
        return ''

annotated_corr = pd.DataFrame(index=binary_cols, columns=binary_cols)

for i, col1 in enumerate(binary_cols):
    for j, col2 in enumerate(binary_cols):
        if i < j:
            annotated_corr.loc[col1, col2] = ''
        elif i == j:
            annotated_corr.loc[col1, col2] = "1.00"
        else:
            r = corr_matrix.loc[col1, col2]
            p = pval_matrix.loc[col1, col2]
            annotated_corr.loc[col1, col2] = f"{float(r):.3f}{get_stars(float(p))}"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print("\n=== Correlation Matrix (with significance stars) ===")
print(annotated_corr)


=== Correlation Matrix (with significance stars) ===
                has_game  has_play has_headset   has_work has_issue  \
has_game            1.00                                              
has_play        0.486***      1.00                                    
has_headset     0.249***  0.238***        1.00                        
has_work        0.058***  0.094***    0.185***       1.00             
has_issue       0.144***  0.163***    0.215***   0.200***      1.00   
has_controller  0.226***  0.204***    0.295***   0.164***  0.221***   
has_button       0.043**  0.075***    0.112***   0.116***  0.109***   
has_brille      -0.041**  -0.037**      -0.024  -0.046***    -0.027   
has_experience  0.311***  0.251***    0.238***   0.064***  0.162***   
has_htc         0.154***  0.145***    0.186***   0.148***  0.222***   
has_month       0.104***  0.105***    0.114***   0.134***  0.137***   
has_base        0.088***  0.118***    0.160***   0.119***  0.177***   
has_product        0.00

In [None]:
import statsmodels.api as sm

X_vars = [col for col in data.columns if col.startswith('has_')]

X = data[X_vars]
y = data['helpful logit']

X = sm.add_constant(X)

# Fit logit regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.609706
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:          helpful logit   No. Observations:                 5405
Model:                          Logit   Df Residuals:                     5381
Method:                           MLE   Df Model:                           23
Date:                Tue, 22 Jul 2025   Pseudo R-squ.:                  0.1074
Time:                        04:10:26   Log-Likelihood:                -3295.5
converged:                       True   LL-Null:                       -3691.9
Covariance Type:            nonrobust   LLR p-value:                1.162e-152
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.9576      0.043    -22.414      0.000      -1.041      -0.874
has_game         

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF
X_no_const = X.drop(columns='const')

vif_df = pd.DataFrame({
    'Variable': X_no_const.columns,
    'VIF': [variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])]
})

vif_df['VIF'] = vif_df['VIF'].round(3)

print(vif_df)

          Variable    VIF
0         has_game  1.785
1         has_play  1.618
2      has_headset  1.527
3         has_work  1.350
4        has_issue  1.291
5   has_controller  1.364
6       has_button  1.107
7       has_brille  1.001
8   has_experience  1.420
9          has_htc  1.568
10       has_month  1.162
11        has_base  1.168
12     has_product  1.155
13       has_price  1.174
14   has_christmas  1.025
15       has_phone  1.231
16     has_samsung  1.153
17        has_note  1.112
18        has_vive  1.837
19         has_set  1.214
20        has_rift  1.184
21     has_quality  1.145
22        has_lens  1.095


In [None]:
print(f"AIC: {result.aic:.3f}")
print(f"BIC: {result.bic:.3f}")

AIC: 6638.925
BIC: 6797.207


In [None]:
import numpy as np
import pandas as pd

summary_table = pd.DataFrame({
    'Variable': result.params.index,
    'Coef': result.params.values,
    'StdErr': result.bse,
    'P-value': result.pvalues,
})

summary_table['Odds Ratio'] = np.exp(summary_table['Coef'])
summary_table['Signif'] = summary_table['P-value'].apply(
    lambda p: '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
)

summary_table['Coef'] = summary_table['Coef'].round(3)
summary_table['Odds Ratio'] = summary_table['Odds Ratio'].round(3)
summary_table['P-value'] = summary_table['P-value'].round(3)

summary_table = summary_table[['Variable', 'Coef', 'Odds Ratio', 'P-value', 'Signif']]
print(summary_table)

                      Variable   Coef  Odds Ratio  P-value Signif
const                    const -0.958       0.384    0.000    ***
has_game              has_game  0.598       1.819    0.000    ***
has_play              has_play  0.059       1.060    0.553       
has_headset        has_headset  0.464       1.591    0.000    ***
has_work              has_work  0.092       1.096    0.230       
has_issue            has_issue  0.434       1.544    0.000    ***
has_controller  has_controller  0.202       1.224    0.081       
has_button          has_button  0.489       1.631    0.015      *
has_brille          has_brille  0.959       2.610    0.002     **
has_experience  has_experience  0.474       1.606    0.000    ***
has_htc                has_htc  0.682       1.979    0.000    ***
has_month            has_month  0.925       2.523    0.000    ***
has_base              has_base  0.554       1.740    0.001     **
has_product        has_product  0.099       1.104    0.260       
has_price 