In [None]:
#!pip install pandas 
#!pip install nltk
!pip install tensorflow

In [14]:
import pandas as pd
import re

# Load the dataset (Replace with actual file path)
df = pd.read_csv("real.csv")  # Change to your file path

# Drop rows where 'Review' is missing
df = df.dropna(subset=['Review'])

# Convert text to lowercase and remove special characters
def clean_text(text):
    if isinstance(text, str):  # Ensure it's a string
        text = text.lower()  # Lowercasing
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
        return text.strip()
    return ""

# Apply cleaning
df['cleaned_review'] = df['Review'].apply(clean_text)

# Show first few rows to verify
print(df[['Review', 'cleaned_review']].head())


                                              Review  \
0        there is no way back, enjoy what you have .   
1   1st 95 went over 300k before being totalled b...   
2   Sold 86 Toyota Van 285K miles to be replaced ...   
3   I have owned lots of vans, and the Previa is ...   
4   My 1997 AWD Previa is the third one that I ha...   

                                      cleaned_review  
0           there is no way back enjoy what you have  
1  st  went over k before being totalled by a tru...  
2  sold  toyota van k miles to be replaced with  ...  
3  i have owned lots of vans and the previa is fa...  
4  my  awd previa is the third one that i have ow...  


In [15]:
# we have downloaded the ntlk lib from that now we are installing stopwords and lemmatizier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [18]:
# we are intializing the lemmatizer and stopwords now 
stop_words=set(stopwords.words('english'))
stop_words
# initializing the lemmatizer 
lemmatizer=WordNetLemmatizer()

def preprocess_text(text):
    words=text.split()  # this splits the sentences into words
    words=[lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

df['processed_review'] = df['cleaned_review'].apply(preprocess_text)
print(df[['cleaned_review', 'processed_review']].head())



                                      cleaned_review  \
0           there is no way back enjoy what you have   
1  st  went over k before being totalled by a tru...   
2  sold  toyota van k miles to be replaced with  ...   
3  i have owned lots of vans and the previa is fa...   
4  my  awd previa is the third one that i have ow...   

                                    processed_review  
0           there is no way back enjoy what you have  
1  st went over k before being totalled by a truc...  
2  sold toyota van k mile to be replaced with pre...  
3  i have owned lot of van and the previa is far ...  
4  my awd previa is the third one that i have own...  


In [19]:
from textblob import TextBlob

# Function to get sentiment category
def get_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity  # Ranges from -1 (negative) to +1 (positive)
    if polarity < -0.1:
        return "Frustrated"
    elif -0.1 <= polarity < 0.1:
        return "Neutral"
    else:
        return "Positive"

# Apply sentiment analysis
df['sentiment'] = df['processed_review'].apply(get_sentiment)

# Show the first few results
print(df[['processed_review', 'sentiment']].head())


                                    processed_review sentiment
0           there is no way back enjoy what you have  Positive
1  st went over k before being totalled by a truc...  Positive
2  sold toyota van k mile to be replaced with pre...  Positive
3  i have owned lot of van and the previa is far ...   Neutral
4  my awd previa is the third one that i have own...   Neutral


In [28]:
df.groupby("sentiment").count()

Unnamed: 0_level_0,Column1,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating,cleaned_review,processed_review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Frustrated,832,832,832,832,831,832,789,832,832
Neutral,4601,4601,4601,4601,4600,4601,4326,4601,4601
Positive,13314,13314,13314,13314,13314,13314,12842,13314,13314


In [29]:
df.groupby("sentiment")['processed_review'].apply(list)


sentiment
Frustrated    [cant go wrong with this car they thought of e...
Neutral       [i have owned lot of van and the previa is far...
Positive      [there is no way back enjoy what you have, st ...
Name: processed_review, dtype: object

# categorizing

### **Step 3: Categorizing Reviews into Service, Parts, and Others**  
Now, we will classify reviews into three categories:  
1️⃣ **Service-related** (e.g., repair, maintenance)  
2️⃣ **Parts-related** (e.g., engine, battery)  
3️⃣ **Others** (everything else)  

---

### **📝 Steps to Implement**  
✅ Define **keyword lists** for Service & Parts.  
✅ Check if a review contains **any keyword** from these lists.  
✅ Assign a category based on **matched keywords**.  
✅ Store the category in a new column called `"category"`.  

---



### **📌 What This Does?**
- Checks if **service-related** words exist → Assigns **"Service"**  
- Checks if **parts-related** words exist → Assigns **"Parts"**  
- If neither → Assigns **"Others"**  

---

### **🔍 Next Step: Find New Frequent Words in "Others"**
Once we categorize the existing reviews, we can check **what words are commonly appearing in "Others"** (in case there are new complaints that need a new category).



In [32]:
# Define Keywords for Classification
service_keywords = ["repair", "maintenance", "delay", "service", "issue", "problem", "technician"]
parts_keywords = ["engine", "battery", "brake", "wheels", "tyre", "oil", "transmission"]

# Function to Assign Categories
def categorize_review(text):
    text = text.lower()  # Convert to lowercase
    if any(word in text for word in service_keywords):
        return "Service"
    elif any(word in text for word in parts_keywords):
        return "Parts"
    else:
        return "Others"

# Apply the Function to Categorize Reviews
df['category'] = df['processed_review'].apply(categorize_review)

# ✅ Check Category Distribution
df['category'].value_counts()

category
Others     9571
Service    6496
Parts      2680
Name: count, dtype: int64

### **Step 4: Identifying New Frequent Words in "Others" Category**  
Since some reviews are classified as **"Others"**, we should check for **frequent words** in them. This will help us identify:  
✅ **New complaint trends** (e.g., a recurring issue with a new car part).  
✅ **Missing keywords** that should be added to the "Service" or "Parts" category.  
✅ **Potential new categories** if a large number of reviews mention the same issue.  

---

### **📌 What This Does?**
1️⃣ Filters out only reviews in the `"Others"` category.  
2️⃣ Splits reviews into individual words.  
3️⃣ Counts the most **frequent** words.  
4️⃣ Shows the **top 20 words** appearing in `"Others"` reviews.  

---

### **🔍 Next Step: Analyze Results**
- If certain words appear **frequently**, we can **add them** to the `service_keywords` or `parts_keywords` list.  
- If a **new issue** emerges, we might need a **new category**.  

Run this and **share the top words** so we can refine the classification! 🚀

In [38]:
from collections import Counter

# Filter "Others" category reviews
others_reviews = df[df['category'] == "Others"]['processed_review']

# Tokenize words
all_words = " ".join(others_reviews).split()

# Get the most common words
word_counts = Counter(all_words)
common_words = word_counts.most_common(70)  # Get top 20 words

# Convert to DataFrame for better visualization
common_words_df = pd.DataFrame(common_words, columns=["Word", "Count"])

# Display the top words
common_words_df


Unnamed: 0,Word,Count
0,the,33719
1,a,20997
2,and,20754
3,i,19649
4,it,16049
...,...,...
65,or,1436
66,no,1434
67,am,1413
68,well,1397


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Encode Category Labels (Service = 0, Parts = 1, Others = 2)
label_encoder = LabelEncoder()
df["category_encoded"] = label_encoder.fit_transform(df["category"])

# Train-Test Split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(
    df["processed_review"], df["category_encoded"], test_size=0.2, random_state=42, stratify=df["category_encoded"]
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Initialize Models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(kernel='linear', probability=True, random_state=42)
nb_model = MultinomialNB()

# Train & Evaluate Models
models = {"Random Forest": rf_model, "SVM": svm_model, "Naïve Bayes": nb_model}
for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    print(f"\n✅ Results for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))




🔹 Training Random Forest...

✅ Results for Random Forest:
Accuracy: 0.9277
              precision    recall  f1-score   support

      Others       0.95      0.99      0.97      1915
       Parts       0.97      0.60      0.74       536
     Service       0.89      0.97      0.93      1299

    accuracy                           0.93      3750
   macro avg       0.94      0.85      0.88      3750
weighted avg       0.93      0.93      0.92      3750


🔹 Training SVM...

✅ Results for SVM:
Accuracy: 0.9675
              precision    recall  f1-score   support

      Others       0.95      1.00      0.98      1915
       Parts       0.94      0.90      0.92       536
     Service       1.00      0.95      0.97      1299

    accuracy                           0.97      3750
   macro avg       0.97      0.95      0.96      3750
weighted avg       0.97      0.97      0.97      3750


🔹 Training Naïve Bayes...

✅ Results for Naïve Bayes:
Accuracy: 0.7349
              precision    recall 

In [43]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Set parameters
MAX_VOCAB_SIZE = 10000  # Limit vocabulary size
MAX_SEQUENCE_LENGTH = 100  # Max words per review
EMBEDDING_DIM = 100  # Embedding vector size

# Tokenization
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["cleaned_review"])

# Convert text to sequences
X = tokenizer.texts_to_sequences(df["cleaned_review"])

# Pad sequences to ensure uniform length
X_padded = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Encode sentiment labels
label_encoder = LabelEncoder()
df["sentiment_encoded"] = label_encoder.fit_transform(df["sentiment"])
y = np.array(df["sentiment_encoded"])

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.1, random_state=42, stratify=y)

print("Data Preparation Complete! Ready for Next Step.")


Data Preparation Complete! Ready for Next Step.


**Step 2: Building the BiLSTM Model.**  

### **Step 2: Define the BiLSTM Model**
Here’s what we’ll do:
- Use an **Embedding Layer** to convert words into dense vectors.
- Add a **Bidirectional LSTM Layer** to capture dependencies from both past and future words.
- Use a **Dense Layer** with `softmax` activation for classification.

Run the following code:  


### **What’s Next?**
✅ If this runs fine, we’ll move to **Step 3: Training the Model.**  
Let me know if there are any issues! 🚀

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Define the BiLSTM model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(64, return_sequences=True)),  # BiLSTM Layer
    Dropout(0.3),  # Dropout for regularization
    Bidirectional(LSTM(32)),  # Another BiLSTM Layer
    Dense(32, activation='relu'),  # Fully connected layer
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the tokenizer with a vocabulary size
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# ✅ Convert all values to strings and handle NaNs
X_train = X_train.astype(str).tolist() if isinstance(X_train, pd.Series) else [str(x) for x in X_train]
X_test = X_test.astype(str).tolist() if isinstance(X_test, pd.Series) else [str(x) for x in X_test]

# ✅ Fit tokenizer on training data
tokenizer.fit_on_texts(X_train)

# ✅ Convert text into sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

print("✅ Tokenization complete.")


AttributeError: 'int' object has no attribute 'lower'

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define maximum sequence length
max_length = 100  

# Pad the sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

print("✅ Padding complete. Shapes:", X_train_padded.shape, X_test_padded.shape)


# Training the Model 



In [47]:
# Train the model
history = model.fit(X_train_padded, y_train, 
                    validation_data=(X_test_padded, y_test), 
                    epochs=5, 
                    batch_size=32)


NameError: name 'X_train_padded' is not defined