In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV into `df` and print a short overview. Every failure mode is
# reported as a one-line message rather than a traceback.
file_path = '/content/nlp.csv'
try:
    df = pd.read_csv(file_path)

    # Preview of the leading rows
    print("First few rows of the dataset:")
    print(df.head())

    # Number of rows and columns
    print("\nDataset shape (rows, columns):", df.shape)

    # Count of missing entries per column
    print("\nMissing values in each column:")
    print(df.isna().sum())

except FileNotFoundError:
    # The path does not exist in this runtime.
    print(f"Error: File not found at {file_path}")
except pd.errors.ParserError:
    # The file exists but pandas could not tokenize it as CSV.
    print(f"Error: Could not parse the file at {file_path}. Check the file format.")
except Exception as exc:
    # Anything else (including errors raised by the overview prints above).
    print(f"An unexpected error occurred: {exc}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

print("\nMissing values in each column:")
print(df.isnull().sum())

# Placeholder values for columns where a missing entry is tolerable.
# Assigning the filled Series back to the column (instead of calling
# `df[col].fillna(..., inplace=True)`) avoids chained in-place mutation, which
# emits a FutureWarning in pandas 2.x and is a no-op under Copy-on-Write.
# NOTE(review): 'boughtInLastMonth' is filled with False as in the original;
# if that column is a numeric count, 0 may be the better default - confirm.
fill_defaults = {
    'title': 'Unknown Title',
    'imgUrl': 'No Image URL',
    'productURL': 'No Product URL',
    'asin': 'Unknown ASIN',
    'isBestSeller': False,
    'boughtInLastMonth': False,
}
for column, default in fill_defaults.items():
    df[column] = df[column].fillna(default)

# Rows missing any essential value are dropped.
df = df.dropna(subset=['stars', 'reviews', 'price', 'listPrice', 'categoryName'])

print("\nMissing values after preprocessing:")
print(df.isnull().sum())

In [None]:
# Remove duplicate rows first, then rebuild a contiguous 0..n-1 index.
# Resetting the index *before* dropping duplicates (the previous order) left
# gaps in the index wherever a duplicate row was removed.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Data type conversion: values that cannot be parsed as numbers become NaN
# (errors='coerce').
for numeric_col in ('price', 'listPrice', 'stars'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Standardizing text data
df['title'] = df['title'].str.lower().str.strip()
df['categoryName'] = df['categoryName'].str.lower().str.strip()
df['asin'] = df['asin'].str.upper().str.strip()  # Assuming ASINs should be uppercase

# URLs are only stripped of surrounding whitespace. They are NOT lower-cased:
# the path and query components of a URL are case-sensitive, so changing case
# can turn a working link into a broken one.
df['productURL'] = df['productURL'].str.strip()
df['imgUrl'] = df['imgUrl'].str.strip()

print("\nData types after conversion:")
print(df.dtypes)

print("\nSample data after standardization:")
print(df.head())

In [None]:
import pandas as pd

# This cell works on the `df` cleaned by the cells above. The CSV is
# deliberately not re-read here: reloading would discard the missing-value
# handling, de-duplication and type conversion already applied.
# NOTE: because it filters `df`, re-running this cell filters again.

# Feature Engineering: length of the title in characters.
# `.str.len()` yields NaN for a missing title instead of raising, unlike `apply(len)`.
df['TitleLength'] = df['title'].str.len()

# Outlier detection and treatment using IQR for 'reviews' (assuming 'reviews' represents the number of reviews)
Q1 = df['reviews'].quantile(0.25)
Q3 = df['reviews'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose 'reviews' lies within [lower_bound, upper_bound] (inclusive).
# `.copy()` gives later cells an independent frame they can assign into.
df = df[df['reviews'].between(lower_bound, upper_bound)].copy()

print("Feature engineering and outlier treatment complete.")
print(df.head())

In [None]:
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora used below (stop-word list and WordNet).
nltk.download('stopwords')
nltk.download('wordnet')

# Standardize the numeric columns in place with StandardScaler.
# NOTE(review): 'stars' is rescaled here too, and later cells read 'stars' as a
# rating (e.g. Reader(rating_scale=(1, 5))) - confirm this is intended.
numeric_cols = ['stars', 'reviews', 'price', 'listPrice']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Resources for text preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Title clean-up in a single pass: lowercase, whitespace-tokenize, drop
# stop words, lemmatize each remaining token, and re-join with single spaces.
df['title'] = df['title'].str.lower().str.split().apply(
    lambda words: ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )
)


In [None]:
def preprocess_text(text):
    """Remove stop words from ``text`` and lemmatize the remaining tokens.

    Uses the module-level ``stop_words`` collection and ``lemmatizer`` object.

    Args:
        text: The string to clean. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as empty text.

    Returns:
        The kept, lemmatized tokens joined by single spaces; ``''`` when there
        is nothing to keep.
    """
    # Missing values reach this function as float NaN / None; they have no
    # `.split()`, so map them to an empty result instead of raising.
    if not isinstance(text, str):
        return ''

    # Whitespace tokenization
    tokens = text.split()
    # The stop-word check is done on the lower-cased token so that capitalized
    # stop words ("The", "And") are removed as well.
    kept = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.lower() not in stop_words
    ]
    return ' '.join(kept)

# Run preprocess_text over every value of the 'title' column and store the
# cleaned strings back in place.
df['title'] = df['title'].map(preprocess_text)

In [None]:
# Splitting the dataset
from sklearn.model_selection import train_test_split

# 'stars' is used as the target; every remaining column is a feature.
y = df['stars']
X = df.drop(columns='stars')

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the sizes of both partitions and preview the training features.
print("\nTraining set shape (rows, columns):", X_train.shape)
print("Testing set shape (rows, columns):", X_test.shape)
print("\nFirst few rows of the processed training dataset:")
print(X_train.head())

In [None]:
import pandas as pd
from textblob import TextBlob  # TextBlob for performing NLP tasks and sentiment analysis
import numpy as np

# Sentiment Analysis
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The text to score. Non-string values (``None``, NaN from a
            missing cell) and blank strings are treated as carrying no
            sentiment.

    Returns:
        A float polarity score between -1 and 1; ``0.0`` for missing or blank
        input.
    """
    # TextBlob raises TypeError for non-string input, so guard before calling
    # it; blank text would score 0.0 anyway.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    return TextBlob(text).sentiment.polarity  # Returns a sentiment score between -1 and 1

# Score every title with get_sentiment and keep the result as a new column.
df['SentimentScore'] = df['title'].map(get_sentiment)

# "Helpfulness" ratio: 'reviews' divided by 'stars'. A 'stars' value of 0 is
# swapped for NaN first so the division yields NaN rather than dividing by zero.
df['HelpfulnessRatio'] = df['reviews'].div(df['stars'].replace(0, np.nan))

In [None]:
# Derived columns consumed by the aggregations below.
df['SentimentScore'] = df['title'].apply(get_sentiment)
df['HelpfulnessRatio'] = df['reviews'] / df['stars'].replace(0, np.nan)

# Columns that must be present before any profile is built.
required_columns = ['asin', 'title', 'reviews', 'stars', 'categoryName', 'isBestSeller']

if not set(required_columns).issubset(df.columns):
    print("The required columns are not present in the dataset.")
else:
    def _most_common(values):
        # First modal value of the group, or NaN when the group has no mode.
        modes = values.mode()
        return np.nan if modes.empty else modes[0]

    # Both profile tables are grouped by 'asin'.
    by_asin = df.groupby('asin')

    # "User" profiles: mean rating, most frequent category, total reviews.
    user_profiles = by_asin.agg(
        AverageRating=('stars', 'mean'),
        PreferredCategories=('categoryName', _most_common),
        TotalReviews=('reviews', 'sum'),
    ).reset_index()

    # Product profiles: mean rating, total reviews, mean sentiment and mean ratio.
    product_profiles = by_asin.agg(
        AverageScore=('stars', 'mean'),
        TotalReviews=('reviews', 'sum'),
        AverageSentimentScore=('SentimentScore', 'mean'),
        AverageHelpfulnessRatio=('HelpfulnessRatio', 'mean'),
    ).reset_index()

    # Preview of the user profiles
    print("User Profiles:")
    print(user_profiles.head())

    # Preview of the product profiles
    print("\nProduct Profiles:")
    print(product_profiles.head())

In [None]:
pip install scikit-surprise

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Surprise's load_from_df reads the three columns, in order, as
# (user id, item id, rating). Here that is ('title', 'asin', 'stars').
# NOTE(review): 'title' stands in for the user id because this notebook never
# references a user-id column; with one title per product this is not a real
# user-item interaction matrix - confirm the intended user column.
# NOTE(review): an earlier cell standardizes 'stars' in place with
# StandardScaler; if that cell has run, values no longer lie in 1-5 - confirm.

# Collaborative Filtering
reader = Reader(rating_scale=(1, 5))  # Rating scale

# Rows without a rating (or without either id) cannot be used for training.
ratings = df[['title', 'asin', 'stars']].dropna()
data = Dataset.load_from_df(ratings, reader)

# Split the dataset into training and testing sets.
# A fixed seed keeps the split - and therefore the reported RMSE - repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialize SVD model (seeded for the same reason as the split)
model = SVD(random_state=42)

# Train the model
model.fit(trainset)

# Make predictions
predictions = model.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE of SVD: {rmse}')

In [None]:
#Content-Based Filtering using TF-IDF and Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
df = pd.read_csv("/content/nlp.csv")

# Build a TF-IDF representation of the product titles (English stop words removed).
tfidf = TfidfVectorizer(stop_words='english')

# Missing titles are read from the CSV as NaN, which the vectorizer rejects as
# an invalid document; treat them as empty text so that every row of `df`
# still gets a (zero) row in the matrix and row positions stay aligned.
tfidf_matrix = tfidf.fit_transform(df['title'].fillna(''))

# tfidf_matrix now holds one TF-IDF row per row of `df`, in the same order.


In [None]:
# Compute the pairwise cosine-similarity matrix of the TF-IDF rows.
# With a single argument, cosine_similarity compares the matrix with itself.
# `df` is intentionally not re-read here: the rows of `cosine_sim` correspond
# to the rows of the `df` that produced `tfidf_matrix`, so that frame is kept.
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)
# Function to get recommendations based on cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_content_based_recommendations(product_name, top_n=10):
    """Return the products most similar to ``product_name``.

    Reads the module-level ``df`` (product table with a 'title' column) and
    ``cosine_sim`` (pairwise similarity matrix). Row ``i`` of ``cosine_sim`` is
    assumed to describe the ``i``-th row of ``df`` by position.

    Args:
        product_name: Exact value of the 'title' column to look up. When
            several rows share the title, the first one is used.
        top_n: Maximum number of similar products to return (default 10).

    Returns:
        A DataFrame with up to ``top_n`` rows of ``df``, ordered from most to
        least similar. The queried row itself is never included. The frame is
        empty (but keeps ``df``'s columns) when there are no other products.

    Raises:
        ValueError: If no row of ``df`` has ``product_name`` as its title.
    """
    # Locate the product by *position*, not by index label: `cosine_sim` is a
    # plain positional matrix, so a label from a non-default index would
    # address the wrong row (or fall outside the matrix).
    matches = (df['title'] == product_name).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise ValueError(f"Product name '{product_name}' not found in the DataFrame.")
    idx = int(matches[0])

    # Similarity of every product to the queried one.
    row = cosine_sim[idx]

    # Only positions valid for both the matrix row and the frame are considered,
    # and the queried product is excluded explicitly. Relying on it sorting
    # first is unsafe, because another row can tie at the same similarity.
    n_candidates = min(len(row), len(df))
    candidates = [(pos, row[pos]) for pos in range(n_candidates) if pos != idx]

    # Most similar first; sorted() is stable, so ties keep their row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    product_indices = [pos for pos, _ in candidates[:max(top_n, 0)]]

    # An empty selection yields an empty frame that still has df's columns.
    return df.iloc[product_indices]

# Demo call: look up one title and list the products most similar to it.
product_name = 'Disney Princess E0274 Royal Shimmer Belle Doll'  # Replace with a valid product title from your DataFrame
try:
    recommended_products = get_content_based_recommendations(product_name)
    print("Content-Based Recommendations:")
    # Only the identifier and the title of each recommendation are shown.
    print(recommended_products.loc[:, ['asin', 'title']])
except IndexError as err:
    print("Index error encountered:", err)
except ValueError as err:
    # Raised by the lookup when the title is unknown.
    print(err)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_content_based_recommendations(product_name, top_n=10):
    """Return the single best product among those most similar to ``product_name``.

    This definition replaces any earlier function of the same name in the
    notebook. It reads the module-level ``df`` (columns 'title', 'stars',
    'reviews', 'price') and ``cosine_sim`` (pairwise similarity matrix). Row
    ``i`` of ``cosine_sim`` is assumed to describe the ``i``-th row of ``df``
    by position.

    The ``top_n`` most similar products are ranked by
    ``0.5 * stars + 0.3 * reviews + 0.2 * (1 / price)`` and the highest-scoring
    one is returned.

    Args:
        product_name: Exact value of the 'title' column to look up. When
            several rows share the title, the first one is used.
        top_n: Number of most-similar products to consider (default 10).

    Returns:
        A Series: the winning row of ``df`` plus a 'weighted_score' entry.

    Raises:
        ValueError: If no row of ``df`` has ``product_name`` as its title.
        IndexError: If there is no other product to recommend.
    """
    # Locate the product by *position*, not by index label: `cosine_sim` is a
    # plain positional matrix.
    matches = (df['title'] == product_name).to_numpy().nonzero()[0]
    if matches.size == 0:
        raise ValueError(f"Product name '{product_name}' not found in the DataFrame.")
    idx = int(matches[0])

    # Similarity of every product to the queried one.
    row = cosine_sim[idx]

    # Exclude the queried product explicitly; relying on it sorting first is
    # unsafe, because another row can tie at the same similarity.
    n_candidates = min(len(row), len(df))
    candidates = [(pos, row[pos]) for pos in range(n_candidates) if pos != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    product_indices = [pos for pos, _ in candidates[:max(top_n, 0)]]

    # Work on a copy so the score column is not written into `df`.
    recommendations = df.iloc[product_indices].copy()
    if recommendations.empty:
        # Same exception type the previous `.iloc[0]` raised, with a usable message.
        raise IndexError("No similar products available to recommend.")

    # Price term: lower price scores higher. A zero, negative or missing price
    # would give an infinite/NaN term that swamps the ranking, so such rows
    # simply get no price bonus.
    price = recommendations['price']
    inverse_price = (1 / price.where(price > 0)).fillna(0)

    # NOTE(review): 'stars' and 'reviews' are combined unnormalized, so the
    # larger-ranged column dominates the score - confirm the weights are meant
    # to apply to raw values.
    recommendations['weighted_score'] = (
        recommendations['stars'] * 0.5 +  # Star rating weight
        recommendations['reviews'] * 0.3 +  # Review count weight
        inverse_price * 0.2  # Price weight (assuming lower price is better)
    )

    # Highest score first; rows with a NaN score sort last.
    best_recommendation = recommendations.sort_values(by='weighted_score', ascending=False).iloc[0]

    return best_recommendation

# Demo call: look up one title and show the single top-ranked recommendation.
product_name = 'Disney Princess E0274 Royal Shimmer Belle Doll'  # Replace with a valid product title
try:
    recommended_product = get_content_based_recommendations(product_name)
    print("Top Content-Based Recommendation:")
    # Only the identifier and the title of the winner are shown.
    print(recommended_product.loc[['asin', 'title']])
except IndexError as err:
    print("Index error encountered:", err)
except ValueError as err:
    # Raised by the lookup when the title is unknown.
    print(err)