In [7]:
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [8]:
review_df = pd.read_csv("amazon-customer-reviews.csv")
review_df.head()

Unnamed: 0,userID,review_date,productID,review_helpful,review_text,review_summary
0,A1BWD4UA5QTBOI,2013-10-28,B004BQKQ8A,"[0, 0]","['card', 'work', 'expected', 'fast', 'great', ...","['bought', 'older', 'computer']"
1,A27JB8ALUWAECO,2014-01-16,B005O74J7O,"[0, 0]","['well', 'case', 'go', 'one', 'look', 'good', ...",[]
2,A32B6A532454UD,2012-06-12,B005P9CATU,"[10, 10]","['using', 'app', 'power', 'connector', 'year',...","['connector', 'standard', 'radio', 'us', '30',..."
3,A225M3GAYN01IG,2011-04-10,B0023APPCI,"[1, 1]","['model', 'use', 'led', 'hd', 'tv', 'bluray', ...","['great', 'sound', 'easy', 'set', 'look', 'ama..."
4,AX2F72W2O86WI,2012-06-07,B0055LGG3Y,"[3, 4]","['system', 'using', 'memory', 'system', 'purch...","['great', 'memory']"


In [9]:
def analyse_sematic(text):
    words = word_tokenize(text)
    analysis = TextBlob("".join(words))
    sentiment = analysis.sentiment.polarity
    return sentiment
review_df['semantic_value_text'] = review_df['review_text'].apply(analyse_sematic)
review_df['semantic_value_summary'] = review_df['review_summary'].apply(analyse_sematic)
review_df.head()

Unnamed: 0,userID,review_date,productID,review_helpful,review_text,review_summary,semantic_value_text,semantic_value_summary
0,A1BWD4UA5QTBOI,2013-10-28,B004BQKQ8A,"[0, 0]","['card', 'work', 'expected', 'fast', 'great', ...","['bought', 'older', 'computer']",0.33,0.166667
1,A27JB8ALUWAECO,2014-01-16,B005O74J7O,"[0, 0]","['well', 'case', 'go', 'one', 'look', 'good', ...",[],0.090278,0.0
2,A32B6A532454UD,2012-06-12,B005P9CATU,"[10, 10]","['using', 'app', 'power', 'connector', 'year',...","['connector', 'standard', 'radio', 'us', '30',...",0.111329,0.0
3,A225M3GAYN01IG,2011-04-10,B0023APPCI,"[1, 1]","['model', 'use', 'led', 'hd', 'tv', 'bluray', ...","['great', 'sound', 'easy', 'set', 'look', 'ama...",0.42,0.558333
4,AX2F72W2O86WI,2012-06-07,B0055LGG3Y,"[3, 4]","['system', 'using', 'memory', 'system', 'purch...","['great', 'memory']",0.0,0.8


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming 'semantic_analysis' is the target variable
X = review_df['review_summary']
y = review_df['semantic_value_summary']

# Convert semantic values to classes (negative, neutral, positive)
y_classes = pd.cut(y, bins=[float('-inf'), 0.0, float('inf')], labels=['negative', 'positive'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_classes, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Choose a model (Logistic Regression in this case)
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Make predictions
predictions = model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9775


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming 'review_df' is your main DataFrame
negative_reviews_df = review_df[review_df['semantic_value_text'] < 0]

# Extract negative reviews
negative_reviews = negative_reviews_df['review_text'].tolist()

# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the text data
X = vectorizer.fit_transform(negative_reviews)

# Print the feature names
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)

# Print the CountVectorizer matrix
print("CountVectorizer Matrix:")
print(X.toarray())


Feature Names: ['07' '0f' '10' '100' '1000' '10000' '100240v' '100ft' '100gb' '100mbsusb'
 '100pack' '100repair' '101' '1010' '10100' '101mp' '102607ok' '103'
 '1063' '1074cloning' '1080p' '1080p60fps' '10gb' '10mb' '10mbitsec' '11'
 '110220' '1116' '114' '115' '1150' '12' '120' '120gig' '120mb' '120mm'
 '121' '125v' '128gb' '12v' '12x' '13' '1300' '135' '14' '1442mm' '1450'
 '146mp' '14inch' '14x' '15' '150' '152' '15fps' '15tb' '15yrold' '16'
 '160gb' '165' '1680x1050' '16gb' '16gig' '16oc' '16th' '16x' '17' '18'
 '180' '19' '190mbs' '1920x1080' '192168176' '192x' '195' '1960' '1980'
 '1990' '1990same' '1999' '19991' '1a' '1d' '1gbitsec' '1mb' '1st'
 '1starextreeeeeeeeemly' '1tb' '20' '200' '20000' '200001' '2001' '2005'
 '2006' '2007' '2008' '2008i' '2009' '200aw' '2010' '20102011' '2012'
 '2013' '2014' '20esata' '21' '211' '218' '22' '220v' '220volt' '23' '24'
 '240' '247' '24mm' '24x' '25' '250' '250gb' '256g' '256gb' '2595lmtcase'
 '25inch' '27' '28' '2after' '2gb' '2nd' '2pm' '2

In [20]:
import numpy as np

# Assuming 'X' is your CountVectorizer matrix
top_features_indices = np.argsort(-X.sum(axis=0))  # Sort features by frequency

# Get the feature names corresponding to the top indices
top_feature_names = np.array(feature_names)[top_features_indices]

# Print the top 10 features
print("Top 10 Features:")
print(top_feature_names[:10])

Top 10 Features:
[['drive' 'work' 'hard' 'use' 'like' 'little' 'bad' 'time' 'case' 'usb'
  'product' 'bought' 'problem' 'battery' 'using' 'cable' 'camera' 'card'
  'need' 'remote' 'laptop' 'small' 'thing' 'used' 'device' 'long' 'buy'
  'make' 'help' 'game' 'tv' 'review' 'computer' 'lens' 'price' 'year'
  'power' 'purchased' 'expensive' 'way' 'really' 'unit' 'complaint' 'say'
  'want' 'star' 'support' 'quality' 'look' 'item' 'wrong' 'amazon'
  'worked' 'player' 'come' 'got' 'screen' 'recommend' 'ipad' 'mouse'
  'port' 'button' 'adapter' 'new' 'expected' 'helpful' 'know' 'external'
  'plug' 'play' 'phone' 'flash' 'month' 'bit' 'tried' '20' 'disappointed'
  '34' 'day' 'hold' 'lot' 'second' 'issue' 'picture' 'video' 'cord'
  'money' 'going' 'pc' 'black' 'far' 'box' 'set' '30' 'solid' 'slow'
  'speed' 'poor' 'software' 'working' 'charge' 'feel' 'read' 'think' 'le'
  'needed' 'probably' 'mean' 'inside' 'extra' 'week' 'connection' 'good'
  'difficult' '10' 'keyboard' 'apple' 'mac' 'old' 'dvd'

In [18]:
print(top_feature_names[:10])

[['drive' 'work' 'hard' 'use' 'like' 'little' 'bad' 'time' 'case' 'usb'
  'product' 'bought' 'problem' 'battery' 'using' 'cable' 'camera' 'card'
  'need' 'remote' 'laptop' 'small' 'thing' 'used' 'device' 'long' 'buy'
  'make' 'help' 'game' 'tv' 'review' 'computer' 'lens' 'price' 'year'
  'power' 'purchased' 'expensive' 'way' 'really' 'unit' 'complaint' 'say'
  'want' 'star' 'support' 'quality' 'look' 'item' 'wrong' 'amazon'
  'worked' 'player' 'come' 'got' 'screen' 'recommend' 'ipad' 'mouse'
  'port' 'button' 'adapter' 'new' 'expected' 'helpful' 'know' 'external'
  'plug' 'play' 'phone' 'flash' 'month' 'bit' 'tried' '20' 'disappointed'
  '34' 'day' 'hold' 'lot' 'second' 'issue' 'picture' 'video' 'cord'
  'money' 'going' 'pc' 'black' 'far' 'box' 'set' '30' 'solid' 'slow'
  'speed' 'poor' 'software' 'working' 'charge' 'feel' 'read' 'think' 'le'
  'needed' 'probably' 'mean' 'inside' 'extra' 'week' 'connection' 'good'
  'difficult' '10' 'keyboard' 'apple' 'mac' 'old' 'dvd' 'charger'
  'con