In [1]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Customer Sentiment Analysis</h2>

<h3 style='color:#2c3e50;'>Objective</h3>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
As a Data Analyst at Amazon, your task is to gauge customer sentiment towards the 
<b>iPhone 15 (128GB)</b> model. This project involves analyzing public perception and evaluating 
customer reactions by conducting sentiment analysis on user-posted product reviews.
</p>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
By extracting and processing customer reviews, the goal is to understand whether sentiment is 
<b>positive or negative</b>. These insights can assist in better decision-making, improving customer 
experience, and identifying key areas for product enhancement.
</p>
"""))


In [2]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Libraries and Tools</h2>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
This project uses a combination of web scraping, data analysis, and sentiment analysis tools:
</p>

<ul style='font-size:15px; line-height:1.7;'>
    <li><b>Selenium:</b> Automates the process of scraping reviews from dynamic websites.</li>
    <li><b>BeautifulSoup:</b> Parses HTML content and extracts structured review data.</li>
    <li><b>Pandas:</b> Cleans, processes, and analyzes the scraped data.</li>
    <li><b>TextBlob:</b> Performs sentiment analysis to classify review polarity.</li>
    <li><b>Matplotlib/Seaborn:</b> Creates visualizations such as sentiment distribution, bar charts, and word clouds.</li>
</ul>
"""))


In [3]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Tasks</h2>

<h3 style='color:#2c3e50;'>1. Data Collection (Web Scraping)</h3>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Tools:</b> Selenium, BeautifulSoup
</p>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
The task is to scrape at least <b>300 customer reviews</b> from Flipkart's product page for the
<b>iPhone 15 (128GB)</b> model. Each review should contain the username, rating (1–5 stars),
and the review text describing the customer's experience.
</p>

<h4 style='color:#2c3e50;'>Steps:</h4>

<ul style='font-size:15px; line-height:1.7;'>
    <li>Use <b>Selenium</b> to automate browser actions, navigate to the Flipkart page, and extract reviews.</li>
    <li>Use <b>BeautifulSoup</b> to parse HTML and extract username, rating, and review text from each review block.</li>
    <li>Handle <b>pagination</b> to collect reviews from multiple pages until 300+ reviews are scraped.</li>
</ul>
"""))


In [4]:
# Import the necessary librariess
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Create empty lists to store the user data such as Name, City, Date of Purchase, Review & Rating
Names = []
Cities = []
Dates = []
Reviews = []
Ratings = []

# Assign the url of the flipkart website and use selenium to scrape data
url = """https://www.flipkart.com/apple-iphone-15-blue-128-gb/product-reviews/itmbf14ef54f645d?pid=MOBGTAGPAQNVFZZY&lid=LSTMOBGTAGPAQNVFZZYQRLPCQ&marketplace=FLIPKART"""
driver = webdriver.Edge()
driver.get(url)


while len(Names) < 320:

    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract names
    names_elements= soup.find_all("p", {"class": "_2NsDsF AwS1CA"})
    for name in  names_elements:
        Names.append(name.text)

    # Extract cities
    city_elements = soup.find_all("p", {"class": "MztJPv"}) 
    for city in city_elements:
        Cities.append(city.text)

    # Extract dates
    dates_elements = soup.find_all("p", {"class": "_2NsDsF"}) 
    for date in dates_elements:
        Dates.append(date.text)
    Actual_Dates = Dates[1::2]

    # Extract reviews
    reviews_elements = soup.find_all("div", {"class": "ZmyHeo"})
    for review in reviews_elements:
        Reviews.append(review.text)

    # Extract ratings
    ratings_elements = soup.find_all("div", class_ = "XQDdHH Ga3i8K")
    for ratings in ratings_elements:
        Ratings.append(ratings.text)

    # Try to click the "Next" button
    try:
        next_button = driver.find_element(By.XPATH, "//span[text()='Next']")
        next_button.click()
        time.sleep(5)
    except:
        break

# Combine data into a DataFrame
df = pd.DataFrame({
    "Name": Names[:-1],
    "City": Cities[:-1],
    "Date": Actual_Dates[:-1],
    "Review": Reviews[:-1],
    "Ratings": Ratings
})
 

ValueError: All arrays must be of the same length

In [None]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>2. Data Cleaning and Preprocessing</h2>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Tool:</b> Pandas
</p>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Task:</b> Clean and preprocess the scraped customer review data to prepare it for further analysis and sentiment evaluation.
</p>

<h3 style='color:#2c3e50;'>Steps:</h3>

<ul style='font-size:15px; line-height:1.7;'>
    <li><b>Remove duplicates:</b> Delete any repeated review entries to ensure high-quality data.</li>
    <li><b>Handle missing values:</b> Address missing ratings or empty review text by removing or imputing values appropriately.</li>
    <li><b>Text preprocessing:</b>
        <ul>
            <li>Convert all text to lowercase.</li>
            <li>Remove special characters, punctuation, and unnecessary spaces.</li>
            <li>Tokenize the text into individual words.</li>
            <li>Remove stop words that do not add analytical value.</li>
            <li>Apply lemmatization to reduce words to their base forms (e.g., "running" → "run").</li>
        </ul>
    </li>
</ul>
"""))


In [None]:
# Check the basic info of the dataframe
df.info()
# Drop the duplicates from the dataframe
df1 = df.copy()
df1 = df1.drop_duplicates()
df1

In [None]:
# Convert the Name column data into Title Case
df1['Name'] = df1['Name'].str.title()
df1.head()

In [None]:
# Clean data of City column by removing unwanted characters/ part of string
df1['City'] = df1['City'].str.replace("Certified Buyer, ", "", regex=False).str.strip()
df1.head()

In [None]:
# Clean data of Review column by removing unwanted characters/ part of string and converting to lowercase
df1['Review'] = df1['Review'].str.lower().str.replace("read more", "", regex=False)
df1head()

In [None]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>3. Sentiment Analysis</h2>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Tool:</b> TextBlob
</p>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Task:</b> Analyze the sentiment of each customer review and categorize it as positive or negative.
</p>

<h3 style='color:#2c3e50;'>Steps:</h3>

<ul style='font-size:15px; line-height:1.7;'>
    <li>Apply <b>TextBlob</b> sentiment analysis to the review text.</li>
    <li>TextBlob returns:
        <ul>
            <li><b>Polarity score</b> ranging from -1 (negative) to +1 (positive)</li>
            <li><b>Subjectivity score</b> ranging from 0 (objective) to 1 (subjective)</li>
        </ul>
    </li>
    <li>Define sentiment classification thresholds:
        <ul>
            <li><b>Positive:</b> Polarity ≥ 0.1</li>
            <li><b>Negative:</b> Polarity &lt; 0.1</li>
        </ul>
    </li>
    <li>Store the sentiment category for each review in the dataset.</li>
</ul>
"""))


In [None]:
# Import libraries for Sentimental analysis of review sentences 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Create a column called Reviews_t that stores tokenized sentences from the Review column using the sent_tokenize function.
df1["Reviews_t"] = df1['Review'].apply(sent_tokenize)
df1

In [None]:
# Import mean from statistics for basic statistics
from statistics import mean

# Function created for assigning Polarity to the Reviews_t column
def get_polarity(sentences):
    return [TextBlob(sentence).sentiment.polarity for sentence in sentences]

# Calls get_polarity function on the Reviews_t column to assign polarity
df1['Polarity'] = df1['Reviews_t'].apply(get_polarity)

# Function created to calculate the average polarity of each review (Average of polarity for each sentences in a review)
def calculate_average_polarity(polarities):
    return mean(polarities) if polarities else 0

# Calls calculate_average_polarity function on the Polarity column to assign the average polarity for each review
df1['Average_Polarity'] = df1['Polarity'].apply(calculate_average_polarity)
df1['Average_Polarity'] = df1['Average_Polarity'].round(2)
df1.head(10)

In [None]:
# Function to assign the Class to the Polarity
def sentiment_class(polarity):
    if polarity > 0.75:
        return 'extremely positive'
    elif 0 < polarity <= 0.75:
        return 'positive'
    elif polarity == 0:
        return 'neutral'
    elif -0.75 <= polarity < 0:
        return 'negative'
    else:
        return 'extremely negative'

# Calls sentiment_class function on the Average_Polarit column to assign the sentiment class
df1['Sentiment_Class'] = df1['Average_Polarity'].apply(sentiment_class)

In [None]:
df1.head()

In [None]:
# Calculates and prints the overall average polarity score of the entire dataset of reviews
polarity_score = df1['Average_Polarity'].mean().round(2)
print(f'Average Polarity Score : {polarity_score}')
if polarity_score > 0.75:
        print('The Average Polarity Score is Extremely Positive')
elif 0 < polarity_score <= 0.75:
    print('The Average Polarity Score is Positive')
elif polarity_score == 0:
    print('The Average Polarity Score is Neutral')
elif -0.75 <= polarity_score < 0:
    print('The Average Polarity Score is Negative')
else:
    print('The Average Polarity Score is Extremely Negative')

In [5]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>4. Data Analysis and Insights</h2>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Tools:</b> Pandas, Matplotlib, Seaborn
</p>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
<b>Task:</b> Analyze the sentiment of customer reviews and extract actionable insights
that help understand user perceptions and product experience.
</p>

<h3 style='color:#2c3e50;'>Steps:</h3>

<ul style='font-size:15px; line-height:1.7;'>
    <li><b>Sentiment Distribution:</b> Compute the overall distribution of positive and negative sentiments from the scraped reviews.</li>
    <li><b>Average Rating vs Sentiment:</b> Investigate whether higher star ratings correspond to higher polarity sentiment scores.</li>
    <li><b>Word Cloud:</b> Create separate word clouds for positive and negative reviews to identify the most commonly used words.</li>
    <li><b>Review Length Analysis:</b> Examine whether longer reviews are linked to deeper or more emotional sentiments.</li>
</ul>
"""))


In [None]:
# Imports libraries for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Plots figure for Sentiment Distribution based on Sentiment Category
plt.figure(figsize=(10, 6))
sns.histplot(x=new_df1.Sentiment_Class, color='green')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment Category')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.show()

In [6]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Sentiment Distribution</h2>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
The bar chart illustrates how different sentiment categories are distributed across the dataset.
The <b>x-axis</b> shows the sentiment categories, while the <b>y-axis</b> represents their frequency.
</p>

<ul style='font-size:15px; line-height:1.7;'>
    <li><b>Positive:</b> The most frequent sentiment with over 200 instances.</li>
    <li><b>Extremely Positive:</b> The second most occurring category, but far less common than "Positive".</li>
    <li><b>Neutral:</b> Appears less frequently than both positive categories.</li>
    <li><b>Negative:</b> The least frequent sentiment in the dataset.</li>
</ul>

<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Overall, the distribution highlights a strong dominance of <b>positive sentiments</b>, with "Positive"
leading significantly, followed by "Extremely Positive". Neutral and negative sentiments occur
at much lower rates.
</p>
"""))


In [None]:
df1['Review_Length'] = df1['Review'].apply(lambda x: len(x.split()))

In [None]:
# Box Plot for Review Length by Sentiment
plt.figure(figsize=(8, 6))
sns.boxplot(x='Sentiment_Class', y='Review_Length', data=df1, hue = 'Sentiment_Class', palette='Set2')
plt.title('Review Length vs Sentiment', fontsize=14)
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Review Length (Number of Words)', fontsize=12)
plt.show()

In [7]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Review Length vs Sentiment</h2>

<h3 style='color:#2c3e50;'>Correlation</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Reviews with higher positive sentiment generally correspond to <b>higher star ratings</b> (4.5–5 stars). 
This is reflected in the clustering patterns and color gradients in the visualization.
</p>

<h3 style='color:#2c3e50;'>Neutral Reviews</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Neutral reviews appear across a wide range of ratings, indicating that <b>neutral sentiment does not always 
match the numeric rating</b> provided by users.
</p>

<h3 style='color:#2c3e50;'>Negative Reviews</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Negative and extremely negative reviews typically align with <b>lower ratings</b>, although some differences 
can occur due to individual reviewer perspectives and subjective interpretations.
</p>
"""))


In [None]:
# Plotting ratings vs average polarity
plt.figure(figsize=(10, 6))
sns.boxplot(x='Average_Polarity', y='Ratings', data = df1, hue = 'Average_Polarity', palette='coolwarm')
plt.title('Ratings vs Average Polarity')
plt.xlabel('Average Polarity')
plt.ylabel('Ratings')
plt.xticks(rotation=90)
plt.show()

In [8]:
from IPython.display import display, HTML

display(HTML("""
<h2 style='color:#2c3e50;'>Ratings vs Average Polarity</h2>

<h3 style='color:#2c3e50;'>Positive Sentiment</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Shows the widest variation in review length with notable outliers, and has a higher median
length than any other sentiment group.
</p>

<h3 style='color:#2c3e50;'>Extremely Positive Sentiment</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Reviews are generally short with a tight distribution and very few outliers.
</p>

<h3 style='color:#2c3e50;'>Neutral Sentiment</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Displays a narrow range of review lengths, closely resembling the “Extremely Positive” group.
</p>

<h3 style='color:#2c3e50;'>Negative Sentiment</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Shows a moderate spread in review length. The median length is shorter than Positive but longer
than Extremely Positive and Neutral sentiments.
</p>

<h3 style='color:#2c3e50;'>Interpretation</h3>
<p style='font-size:15px; line-height:1.7; text-align:justify;'>
Positive reviews tend to be more detailed and longer in length. Extremely Positive and Neutral
reviews are typically short, while Negative reviews vary but are generally more concise than
Positive reviews. This suggests that satisfied users often elaborate more on their experiences.
</p>
"""))


In [9]:
from IPython.display import display, HTML

display(HTML("""
<div style='font-family:Arial; line-height:1.7;'>
<h1 style='color:#2c3e50;'>Sentiment Analysis Report — Customer Reviews for iPhone 15 (128GB) on Flipkart</h1>

<h2>1. Overview</h2>
<p>This report summarizes insights obtained from sentiment analysis on customer reviews of the iPhone 15 (128GB) from Flipkart. Over 300 reviews were scraped, cleaned, and analyzed to identify customer perceptions, key trends, strengths, and areas of improvement.</p>

<h2>2. Data Collection & Cleaning Summary</h2>
<h3>Data Collection</h3>
<ul>
<li>Scraped using Selenium</li>
<li>Parsed using BeautifulSoup</li>
<li>Collected Username, Rating, Review Text, City, Purchase Date</li>
</ul>

<h3>Data Cleaning</h3>
<ul>
<li>Removed duplicates</li>
<li>Handled missing values</li>
<li>Text preprocessing: lowercasing, punctuation removal, tokenization, stopword removal, lemmatization</li>
</ul>

<h2>3. Sentiment Analysis Results</h2>
<p>TextBlob provided polarity (–1 to +1) and subjectivity (0–1).</p>
<ul>
<li><b>Positive:</b> ≥ 0.1</li>
<li><b>Negative:</b> &lt; 0.1</li>
</ul>

<h3>Sentiment Distribution</h3>
<ul>
<li>Positive: Most frequent</li>
<li>Extremely Positive: Second most frequent</li>
<li>Neutral: Moderate</li>
<li>Negative: Least</li>
</ul>
<p><b>Conclusion:</b> Customer sentiment is overwhelmingly positive.</p>

<h2>4. Insights From Sentiment Analysis</h2>
<h3>A. Ratings vs Sentiment</h3>
<ul>
<li>Positive sentiment aligns with 4–5 star ratings</li>
<li>Neutral spans 3–4 stars</li>
<li>Negative aligns with 1–2 stars</li>
</ul>

<h3>B. Review Length vs Sentiment</h3>
<ul>
<li>Positive reviews are longer and detailed</li>
<li>Extremely Positive & Neutral reviews are short</li>
<li>Negative reviews are concise</li>
</ul>

<h3>C. Word Cloud Insights</h3>
<ul>
<li><b>Positive:</b> camera, performance, battery, design</li>
<li><b>Negative:</b> overheating, price, battery drain, slow delivery</li>
</ul>

<h2>5. Key Findings & Trends</h2>
<h3>Positive Highlights</h3>
<ul>
<li>Camera quality</li>
<li>Smooth UI</li>
<li>Premium design</li>
<li>Good battery life</li>
<li>Brand trust</li>
</ul>

<h3>Common Issues</h3>
<ul>
<li>Heating problems</li>
<li>High price</li>
<li>Battery complaint (minor)</li>
<li>Delivery delays</li>
</ul>

<h2>6. Recommendations</h2>
<h3>A. For Apple</h3>
<ul>
<li>Optimize heating</li>
<li>Improve battery performance</li>
<li>Offer promotional value</li>
</ul>

<h3>B. For Flipkart</h3>
<ul>
<li>Improve delivery speed</li>
<li>Highlight product strengths</li>
<li>Clarify return/replacement policies</li>
</ul>

<h2>7. Conclusion</h2>
<p>Overall sentiment is <b>highly positive</b>.  
Customers praise performance and design but mention heating and price as concerns.  
Insights help Apple improve product quality and help Flipkart enhance customer experience and marketing.</p>
</div>
"""))
