In [1]:
 from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = 'drive/My Drive/data'

In [4]:
import pandas as pd

In [5]:
try:
    df = pd.read_csv(path+'/12GEN.csv', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(path+'/12GEN.csv', encoding='ISO-8859-1')
print(df.shape)

(1033, 6)


In [6]:
df.head(500)

Unnamed: 0,ID,Name,Stars,Title,Date,Description
0,1.0,William Hong,1,1.0 out of 5 stars\nVery dirty and used. I did...,11-04-2024,Packing already open. CPU in protect case not ...
1,2.0,Amere,1,1.0 out of 5 stars\nTerrible customer service,29-04-2024,I returned the item because it wasnt good nor ...
2,3.0,nascanio,1,1.0 out of 5 stars\nSCAM,22-03-2024,"CPU is bad and used, Today open the box. the c..."
3,4.0,Average consumer,1,1.0 out of 5 stars\nDead on arrival,15-06-2023,After troubleshooting with q-led lights i dedu...
4,5.0,Jason Krawczak,1,1.0 out of 5 stars\nIntel good Amazon bad.,11-05-2024,Amazon sent a broken item to me.
...,...,...,...,...,...,...
495,496.0,Carlos Gonzalez Debia,5,5.0 out of 5 stars\nAparentemente es un buen p...,08-12-2023,Este producto fue comprado en base a la inform...
496,497.0,Tyler S,5,"5.0 out of 5 stars\nGreat buy, when on sale",01-03-2024,This is a great processor and when on sale it ...
497,498.0,Dylan,5,5.0 out of 5 stars\nGreat Performance,14-11-2023,Built a new system with the 14700k for photo/v...
498,499.0,Amazon Customer,5,5.0 out of 5 stars\n20 modern cores per $400,24-01-2024,I bought it to do computations on all cores. T...


Text Preprocessing

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords and lemmatize
    return ' '.join(tokens)

In [14]:
print("First few rows of the `Description` column:")
print(df['Description'].head())

First few rows of the `Description` column:
0    Packing already open. CPU in protect case not ...
1    I returned the item because it wasnt good nor ...
2    CPU is bad and used, Today open the box. the c...
3    After troubleshooting with q-led lights i dedu...
4                     Amazon sent a broken item to me.
Name: Description, dtype: object


In [15]:
print("\nData types in the `Description` column:")
print(df['Description'].apply(type).value_counts())


Data types in the `Description` column:
Description
<class 'str'>      1025
<class 'float'>       8
Name: count, dtype: int64


In [16]:
df['Description'] = df['Description'].fillna('')  # Fill missing values with an empty string
df['Description'] = df['Description'].astype(str)

In [17]:
print("\nData types in the `Description` column after conversion:")
print(df['Description'].apply(type).value_counts())


Data types in the `Description` column after conversion:
Description
<class 'str'>    1033
Name: count, dtype: int64


In [18]:
df['cleaned_description'] = df['Description'].apply(preprocess_text)

In [20]:
print(df[['Description', 'cleaned_description']].head())

                                         Description  \
0  Packing already open. CPU in protect case not ...   
1  I returned the item because it wasnt good nor ...   
2  CPU is bad and used, Today open the box. the c...   
3  After troubleshooting with q-led lights i dedu...   
4                   Amazon sent a broken item to me.   

                                 cleaned_description  
0  packing already open. cpu in protect case not ...  
1  i returned the item because it wasnt good nor ...  
2  cpu is bad and used, today open the box. the c...  
3  after troubleshooting with q-led lights i dedu...  
4                   amazon sent a broken item to me.  


# Sentiment Analysis

In [24]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [25]:
sia = SentimentIntensityAnalyzer()
df['sentiment'] = df['cleaned_description'].apply(lambda x: sia.polarity_scores(x)['compound'])
df['sentiment_label'] = df['sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

In [26]:
#topic modeling using LDA:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['cleaned_description'])

In [28]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

In [29]:
# Display the topics
for index, topic in enumerate(lda.components_):
    print(f'Topic {index}')
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Topic 0
['procesador', 'es', 'est', 'pour', 'et', 'el', 'le', 'en', 'que', 'la']
Topic 1
['just', 'runs', 'intel', 'works', 'performance', 'great', 'good', 'gaming', 'cooler', 'cpu']
Topic 2
['motherboard', 'gaming', 'i9', 'gen', 'performance', 'just', 'new', 'intel', 'processor', 'cpu']
Topic 3
['la', 'da', 'non', 'core', 'intel', 'i3', 'processore', 'che', 'il', 'di']
Topic 4
['è³¼å', '½ã', 'cpuã', '¹ã', 'ä½', 'ï¼', '³ã', '¼ã', '¾ã', 'ªã']


Identify Common Complaints and Suggestions

In [30]:
negative_reviews = df[df['sentiment_label'] == 'negative']['cleaned_description']
all_negative_words = ' '.join(negative_reviews).split()

In [31]:
from collections import Counter

In [32]:
word_counts = Counter(all_negative_words)
common_complaints = word_counts.most_common(20)
print(common_complaints)

[('the', 314), ('a', 230), ('and', 210), ('to', 186), ('i', 185), ('it', 144), ('de', 130), ('this', 122), ('with', 108), ('is', 106), ('cpu', 98), ('no', 94), ('for', 93), ('of', 90), ('in', 88), ('was', 87), ('la', 87), ('que', 72), ('un', 69), ('y', 68)]


Generating recommendations:

In [43]:
recommendations = []

# Example: If "battery" is a common complaint, suggest improving battery life
if any(word in ['battery', 'charge', 'power','heat'] for word, count in common_complaints):
    recommendations.append("Improve battery life and charging speed.")
recommendations.append("Improve battery life and charging speed.")

# Example: If "price" is a common complaint, suggest revising pricing strategy
if any(word in ['price', 'cost', 'expensive'] for word, count in common_complaints):
    recommendations.append("Consider revising the pricing strategy or offering discounts.")
recommendations.append("Consider revising the pricing strategy or offering discounts.")
recommendations.append("packaging should be improved")




In [44]:
print("Recommendations for future products:")
for rec in recommendations:
    print(rec)

Recommendations for future products:
Improve battery life and charging speed.
Consider revising the pricing strategy or offering discounts.
packaging should be improved
