## SENTIMENT ANALYSIS

## SPRINT 1 - Create DataFrame from raw text files

In [20]:
import pandas as pd

In [2]:
# Sample review record (transcribed from the provided image) used to try out
# the parser: one "Key: value" field per line inside a triple-quoted string.
data = """
ProductId: B001E4KFG0
UserId: A3SGXH7AUHU8GW
ProfileName: delmartian
HelpfulnessNumerator: 1
HelpfulnessDenominator: 1
Score: 5
Time: 1303862400
ReviewSummary: Good Quality Dog Food
ReviewText: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.
"""

In [3]:
# Parsing function to extract fields
def parse_review_data(text):
    """Parse one raw review record into a ``{field: value}`` dict.

    Each field is expected on its own line as ``Key: value``. Only the first
    ``': '`` on a line separates key from value, so values may themselves
    contain colons.

    Tolerated irregularities (each of which used to raise ``ValueError`` from
    the tuple unpacking):

    * blank lines are skipped, so empty input yields ``{}``;
    * a bare ``Key:`` line (no whitespace inside the key) becomes a field
      with an empty value;
    * any other line without ``': '`` is treated as a continuation of the
      preceding field and appended to its value after a single space.

    Args:
        text: Raw text of a single review record.

    Returns:
        dict mapping field names to string values, in order of appearance.

    Raises:
        ValueError: If non-blank text appears before the first field line.
    """
    review_data = {}
    last_key = None
    for raw_line in text.strip().split('\n'):
        # Drop a stray carriage return left over from CRLF line endings.
        line = raw_line.rstrip('\r')
        if not line.strip():
            continue

        key, separator, value = line.partition(': ')
        if separator:
            review_data[key] = value
            last_key = key
            continue

        # "Key:" with nothing after it, e.g. when text.strip() removed the
        # trailing space of an empty final field.
        bare_key = line[:-1]
        if line.endswith(':') and bare_key and not any(ch.isspace() for ch in bare_key):
            review_data[bare_key] = ''
            last_key = bare_key
            continue

        if last_key is None:
            raise ValueError(f"Expected a 'Key: value' line, got: {line!r}")

        # Continuation of a multi-line value: fold it into the previous field.
        previous = review_data[last_key]
        extra = line.strip()
        review_data[last_key] = f'{previous} {extra}' if previous else extra
    return review_data

# Run the parser on the sample record
parsed_data = parse_review_data(data)

# Build a one-row DataFrame and surface the positional index as an 'Id' column
df = (
    pd.DataFrame([parsed_data])
    .reset_index()
    .rename(columns={'index': 'Id'})
)

# Write the frame to CSV (without the pandas index) and show it
df.to_csv('reviews.csv', index=False)
print(df)

   Id   ProductId          UserId ProfileName HelpfulnessNumerator  \
0   0  B001E4KFG0  A3SGXH7AUHU8GW  delmartian                    1   

  HelpfulnessDenominator Score        Time          ReviewSummary  \
0                      1     5  1303862400  Good Quality Dog Food   

                                          ReviewText  
0  I have bought several of the Vitality canned d...  


In [4]:
import os

In [5]:
# Folder containing the raw review .txt files. Set the REVIEWS_DIR environment
# variable to point somewhere else; the original local path stays the default
# so existing runs behave as before.
directory = os.environ.get("REVIEWS_DIR", r"C:\Users\navee\OneDrive\Desktop\text_reviews")

In [6]:
# Collect one parsed record (dict of field -> value) per review file
reviews = []

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the row order -- and therefore the Id assigned below -- reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    # NOTE(review): no encoding is passed, so the platform default is used --
    # confirm the encoding of the review files.
    with open(os.path.join(directory, filename), 'r') as file:
        raw_text = file.read()
    # Loop-local names keep the sample `data` / `parsed_data` from the
    # earlier cell from being overwritten.
    reviews.append(parse_review_data(raw_text))

# Build the DataFrame once from the list of dicts, then expose the positional
# index as an explicit 'Id' column
df = pd.DataFrame(reviews).reset_index().rename(columns={'index': 'Id'})

# Save DataFrame to CSV
df.to_csv('reviews.csv', index=False)

## SPRINT 2 - Build a model

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import string

In [10]:
# Read the reviews CSV back into a DataFrame for modelling
df = pd.read_csv(r"reviews.csv")

In [11]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText
0,0,B000LQORDE,A19W47CXJJP1MI,Amazonian Consumer,2,5,5,1235088000,"This spicy noodle cures my cold, upset stomach...",I love this noodle and have it once or twice a...
1,1,B000LQORDE,A13LMI7F7UC2VO,Super Villain,6,12,3,1251244800,I'm spicyyyyyyyyyy,That's pretty much what these ramen noodles ha...
2,2,B000LQORDE,A29WRXXYKLFTG,zhenzhen,0,2,5,1320537600,I like it,The taste is great! especially when you cook i...
3,3,B000LQORDE,A2LKCOIVLZWDG5,TopQuark,0,2,5,1320537600,The best instant noodle!,This is the best instant noodle I have tried. ...
4,4,B000LQORDE,A1GPN9X27K5WN,"Chase A Byrd ""byrd720""",0,2,5,1319932800,Awesome!,I don't see how anyone could say anything bad ...


In [12]:
# Translation table that deletes every ASCII punctuation character
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Basic text preprocessing function
def preprocess_text(text):
    """Lowercase ``text`` and strip ASCII punctuation.

    Missing values (``None`` / NaN, which ``pd.read_csv`` produces for empty
    cells) are mapped to an empty string instead of raising
    ``AttributeError``; any other non-string value is converted with ``str()``.

    Args:
        text: The raw review text, or a missing / non-string cell value.

    Returns:
        The lowercased text with all ``string.punctuation`` characters removed.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)

In [13]:
# Normalise every review body element-wise with preprocess_text
df['ReviewText'] = df['ReviewText'].map(preprocess_text)

In [14]:
# TF-IDF features from the review text, capped at max_features=5000
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so statistics from the test rows influence the features.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['ReviewText'])
X = tfidf_matrix.toarray()  # dense array, as the models below are fed

# Target: the review score column
y = df['Score']

In [15]:
# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Candidate regressors keyed by display name (insertion order is kept)
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
])

In [17]:
# Fit each candidate on the training split and report its held-out metrics
for name in models:
    model = models[name]
    # fit() returns the estimator itself, so the prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name}: MSE = {mse:.2f}, R^2 = {r2:.2f}')

Linear Regression: MSE = 1.50, R^2 = 0.09
Random Forest: MSE = 1.57, R^2 = 0.04
Gradient Boosting: MSE = 1.59, R^2 = 0.03


In [18]:
# Score every fitted model on the test split, then keep the lowest-MSE one
# NOTE(review): the winner is chosen on the same test split used to report
# the metrics, so its reported error is likely optimistic.
test_mse = {
    model_name: mean_squared_error(y_test, fitted.predict(X_test))
    for model_name, fitted in models.items()
}
best_model_name = min(test_mse, key=test_mse.get)
best_model = models[best_model_name]
print(f'Best Model: {best_model_name}')

Best Model: Linear Regression
