In [2]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords
print("Downloading NLTK stopwords...")
nltk.download('stopwords')

# Load the JSONL file into a DataFrame
print("Loading dataset...")
file_path = "Cell_Phones_and_Accessories_5.json"
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)
print("Dataset loaded successfully.")

# Data Cleaning
print("Cleaning data...")
df.dropna(subset=['reviewText', 'overall'], inplace=True)  # Remove rows with missing values
df['reviewText'] = df['reviewText'].str.lower()  # Convert text to lowercase
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation

# Remove stopwords
print("Removing stopwords...")
stop_words = set(stopwords.words('english'))
df['filteredText'] = df['reviewText'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
print("Data cleaning completed.")

# Dataset Splitting
print("Splitting dataset into train, validation, and test sets...")
train, temp = train_test_split(df, test_size=0.2, random_state=42)
validation, test = train_test_split(temp, test_size=0.5, random_state=42)
print("Dataset split successfully.")



Downloading NLTK stopwords...
Loading dataset...


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kcasimiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset loaded successfully.
Cleaning data...
Removing stopwords...
Data cleaning completed.
Splitting dataset into train, validation, and test sets...
Dataset split successfully.


In [None]:
# Bag of Words Vectorization
print("Vectorizing text using Bag of Words...")
vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X_train = vectorizer.fit_transform(train['filteredText']).toarray()
X_validation = vectorizer.transform(validation['filteredText']).toarray()
X_test = vectorizer.transform(test['filteredText']).toarray()
print("Vectorization completed.")

# Target Variable
y_train = train['overall']
y_validation = validation['overall']
y_test = test['overall']

# Parameter Grid for Logistic Regression
param_grid = {'C': [0.001, 0.0001, 0.00001, 0.01, 0.1, 1, 10]}
logistic = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

# Grid Search for Optimal C
print("Performing Grid Search for optimal hyperparameter C...")
grid_search = GridSearchCV(logistic, param_grid, cv=5, scoring='accuracy', verbose=1, return_train_score=True)
grid_search.fit(X_train, y_train)
print("Grid Search completed.")

# Output validation accuracies for each C value
results = pd.DataFrame(grid_search.cv_results_)
print("Validation Accuracies for Different C Values:")
for c, mean_val_score in zip(results['param_C'], results['mean_test_score']):
    print(f"C: {c} -> Validation Accuracy: {mean_val_score:.4f}")

# Best Model Evaluation
best_c = grid_search.best_params_['C']
print(f"\nOptimal C value: {best_c}")

print("Evaluating the best model on the test set...")
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Test Accuracy and Classification Report
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy with optimal C: {test_accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Save Vectorizer and Model (Optional)
print("Saving the vectorizer and model...")
import pickle
with open('bow_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Vectorizer and model saved successfully.")


Vectorizing text using Bag of Words...


In [4]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle
import gc  # Garbage collection for memory management

# Download stopwords
print("Downloading NLTK stopwords...")
nltk.download('stopwords')

# Load a subset of the JSONL file into a DataFrame
print("Loading dataset...")
file_path = "Cell_Phones_and_Accessories_5.json"
data = []
with open(file_path, 'r') as f:
    for i, line in enumerate(f):
        if i >= 10000:  # Limit to 10,000 reviews for debugging
            break
        data.append(json.loads(line))
df = pd.DataFrame(data)
print(f"Dataset loaded successfully with {len(df)} records.")

# Data Cleaning
print("Cleaning data...")
df.dropna(subset=['reviewText', 'overall'], inplace=True)  # Remove rows with missing values
df['reviewText'] = df['reviewText'].str.lower()  # Convert text to lowercase
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation

# Remove stopwords
print("Removing stopwords...")
stop_words = set(stopwords.words('english'))
df['filteredText'] = df['reviewText'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
print("Data cleaning completed.")

# Dataset Splitting
print("Splitting dataset into train, validation, and test sets...")
train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df['overall'])
validation, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['overall'])
print("Dataset split successfully.")

# Bag of Words Vectorization
print("Vectorizing text using Bag of Words...")
vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 words
X_train = vectorizer.fit_transform(train['filteredText'])  # Sparse matrix
X_validation = vectorizer.transform(validation['filteredText'])  # Sparse matrix
X_test = vectorizer.transform(test['filteredText'])  # Sparse matrix
print("Vectorization completed.")

# Target Variable
y_train = train['overall']
y_validation = validation['overall']
y_test = test['overall']

# Parameter Grid for Logistic Regression
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
logistic = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

# Grid Search for Optimal C
print("Performing Grid Search for optimal hyperparameter C...")
grid_search = GridSearchCV(
    logistic, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Grid Search completed.")

# Output validation accuracies for each C value
results = pd.DataFrame(grid_search.cv_results_)
print("Validation Accuracies for Different C Values:")
for c, mean_val_score in zip(results['param_C'], results['mean_test_score']):
    print(f"C: {c} -> Validation Accuracy: {mean_val_score:.4f}")

# Best Model Evaluation
best_c = grid_search.best_params_['C']
print(f"\nOptimal C value: {best_c}")

print("Evaluating the best model on the test set...")
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)

# Test Accuracy and Classification Report
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy with optimal C: {test_accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Save Vectorizer and Model
print("Saving the vectorizer and model...")
with open('bow_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Vectorizer and model saved successfully.")

# Cleanup
del X_train, X_validation, X_test
gc.collect()  # Free memory


Downloading NLTK stopwords...
Loading dataset...
Dataset loaded successfully with 10000 records.
Cleaning data...
Removing stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kcasimiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data cleaning completed.
Splitting dataset into train, validation, and test sets...
Dataset split successfully.
Vectorizing text using Bag of Words...
Vectorization completed.
Performing Grid Search for optimal hyperparameter C...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Grid Search completed.
Validation Accuracies for Different C Values:
C: 0.0001 -> Validation Accuracy: 0.5169
C: 0.001 -> Validation Accuracy: 0.5205
C: 0.01 -> Validation Accuracy: 0.5515
C: 0.1 -> Validation Accuracy: 0.5827
C: 1 -> Validation Accuracy: 0.5650
C: 10 -> Validation Accuracy: 0.5326
C: 100 -> Validation Accuracy: 0.5134
C: 1000 -> Validation Accuracy: 0.5004

Optimal C value: 0.1
Evaluating the best model on the test set...

Test Accuracy with optimal C: 0.58

Classification Report:
              precision    recall  f1-score   support

         1.0       0.55      0.43      0.49        97
         2.0       0.33      0.14      0.20        71
         3.0       0.42      0.14      0.2

0

[CV] END ............................................C=0.001; total time=   0.1s
[CV] END ..............................................C=0.1; total time=   0.3s
[CV] END ................................................C=1; total time=   0.6s
[CV] END ............................................C=0.001; total time=   0.1s
[CV] END .............................................C=0.01; total time=   0.2s
[CV] END ................................................C=1; total time=   0.6s
[CV] END ...............................................C=10; total time=   1.1s
[CV] END ............................................C=0.001; total time=   0.1s
[CV] END .............................................C=0.01; total time=   0.2s
[CV] END ...............................................C=10; total time=   1.1s
[CV] END ...............................................C=10; total time=   1.2s
[CV] END .............................................C=0.01; total time=   0.2s
[CV] END ...................

In [3]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2
import pickle
import gc

# Download stopwords
print("Downloading NLTK stopwords...")
nltk.download('stopwords')

# Load a subset of the dataset for optimization
print("Loading dataset...")
file_path = "Cell_Phones_and_Accessories_5.json"
data = []
with open(file_path, 'r') as f:
    for i, line in enumerate(f):
        if i >= 10000:  # Limit to first 10,000 reviews for optimization
            break
        data.append(json.loads(line))
df = pd.DataFrame(data)
print(f"Dataset loaded successfully with {len(df)} records.")

# Data Cleaning
print("Cleaning data...")
df.dropna(subset=['reviewText', 'overall'], inplace=True)  # Remove rows with missing values
df['reviewText'] = df['reviewText'].str.lower()  # Convert text to lowercase
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation

# Remove stopwords
print("Removing stopwords...")
stop_words = set(stopwords.words('english'))
df['filteredText'] = df['reviewText'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
print("Data cleaning completed.")

# Dataset Splitting
print("Splitting dataset into train, validation, and test sets...")
train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df['overall'])
validation, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['overall'])
print("Dataset split successfully.")

# TF-IDF Vectorization
print("Vectorizing text using TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=10000,  # Top 10,000 words
    min_df=5,  # Ignore words in fewer than 5 documents
    max_df=0.9,  # Ignore very common words in 90%+ documents
    ngram_range=(1, 2)  # Unigrams and bigrams
)
X_train = vectorizer.fit_transform(train['filteredText'])  # Sparse matrix
X_validation = vectorizer.transform(validation['filteredText'])  # Sparse matrix
X_test = vectorizer.transform(test['filteredText'])  # Sparse matrix
print("Vectorization completed.")

# Feature Selection
print("Selecting top features using Chi-Square Test...")
selector = SelectKBest(chi2, k=5000)  # Select top 5,000 features
X_train_reduced = selector.fit_transform(X_train, train['overall'])
X_validation_reduced = selector.transform(X_validation)
X_test_reduced = selector.transform(X_test)
print("Feature selection completed.")

# Target Variable
y_train = train['overall']
y_validation = validation['overall']
y_test = test['overall']

# Parameter Grid for Logistic Regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
logistic = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

# Grid Search for Optimal C
print("Performing Grid Search for optimal hyperparameter C...")
grid_search = GridSearchCV(
    logistic, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1
)
grid_search.fit(X_train_reduced, y_train)
print("Grid Search completed.")

# Output validation accuracies for each C value
results = pd.DataFrame(grid_search.cv_results_)
print("Validation Accuracies for Different C Values:")
for c, mean_val_score in zip(results['param_C'], results['mean_test_score']):
    print(f"C: {c} -> Validation Accuracy: {mean_val_score:.4f}")

# Best Model Evaluation
best_c = grid_search.best_params_['C']
print(f"\nOptimal C value: {best_c}")

print("Evaluating the best model on the test set...")
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test_reduced)

# Test Accuracy and Classification Report
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy with optimal C: {test_accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Save Vectorizer, Selector, and Model
print("Saving the vectorizer, selector, and model...")
with open('tfidf_vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
with open('feature_selector.pkl', 'wb') as sel_file:
    pickle.dump(selector, sel_file)
with open('logistic_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("Vectorizer, selector, and model saved successfully.")

# Cleanup
del X_train, X_train_reduced, X_validation, X_validation_reduced, X_test, X_test_reduced
gc.collect()  # Free memory


Downloading NLTK stopwords...
Loading dataset...
Dataset loaded successfully with 10000 records.
Cleaning data...
Removing stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kcasimiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data cleaning completed.
Splitting dataset into train, validation, and test sets...
Dataset split successfully.
Vectorizing text using TF-IDF...
Vectorization completed.
Selecting top features using Chi-Square Test...
Feature selection completed.
Performing Grid Search for optimal hyperparameter C...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Grid Search completed.
Validation Accuracies for Different C Values:
C: 0.001 -> Validation Accuracy: 0.5169
C: 0.01 -> Validation Accuracy: 0.5169
C: 0.1 -> Validation Accuracy: 0.5167
C: 1 -> Validation Accuracy: 0.5835
C: 10 -> Validation Accuracy: 0.6109

Optimal C value: 10
Evaluating the best model on the test set...

Test Accuracy with optimal C: 0.58

Classification Report:
              precision    recall  f1-score   support

         1.0       0.55      0.43      0.48        97
         2.0       0.23      0.08      0.12        71
         3.0       0.38      0.18      0.25       111
         4.0       0.39      0.35   

0