In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the bike listings from bike.csv (relative path) into `df`, the frame used by the bike cells below.
df = pd.read_csv("bike.csv")

In [None]:
# Preview the first rows to see the available columns and sample values.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# 1. What is the range of selling prices in the dataset?
# The range is the gap between the highest and the lowest selling price.
selling_prices = df['selling_price']
minp, maxp = selling_prices.min(), selling_prices.max()
rangep = maxp - minp
print('The range of selling prices in the dataset is {}'.format(rangep))

In [None]:
# 2. What is the median selling price for bikes in the dataset?
selling_price_column = df['selling_price']
medianp = selling_price_column.median()
print("The median selling price for bikes in the dataset is {}".format(medianp))

In [None]:
# 3. What is the most common seller type?
# mode() returns every value tied for most frequent; take the first entry.
seller_modes = df['seller_type'].mode()
common = seller_modes[0]
print('The most common seller type is {}'.format(common))

In [None]:
# 4. How many bikes have driven more than 50,000 kilometers?
# Summing a boolean mask counts the rows where the condition holds.
over_50k = df['km_driven'] > 50000
count = over_50k.sum()
print("{} bikes have driven more than 50,000 kilometers".format(count))

In [None]:
# 5. What is the average km_driven value for each ownership type?
# One mean per distinct value of the `owner` column.
km_by_owner = df.groupby('owner')['km_driven']
average = km_by_owner.mean()
print(average)

In [None]:
# 6. What proportion of bikes are from the year 2015 or older?
# The mean of a boolean mask is the fraction of True rows; scale it to a percentage.
proportion = (df['year'] <= 2015).mean() * 100
# Use ".1f" without the space sign flag so the sentence does not begin with a stray blank.
print(f"{proportion:.1f}% of bikes are from the year 2015 or older.")

In [None]:
# 7.  What is the trend of missing values across the dataset?
# Count of null entries per column.
df.isnull().sum()

In [None]:
# Impute missing ex-showroom prices with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: that chained in-place form acts on a
# temporary object under pandas Copy-on-Write and would leave `df` unchanged.
ex_showroom_median = df["ex_showroom_price"].median()
df["ex_showroom_price"] = df["ex_showroom_price"].fillna(ex_showroom_median)

In [None]:
# 8. What is the highest ex_showroom_price recorded, and for which bike?
# Highest ex-showroom price per bike name, then keep the single top row.
price_by_name = df.groupby("name", as_index=False)["ex_showroom_price"]
highest = price_by_name.max()
ranked_by_price = highest.sort_values("ex_showroom_price", ascending=False)
top_bike = ranked_by_price.head(1)
print(top_bike)

In [None]:
# 9. What is the total number of bikes listed by each seller type?
# Row count per distinct seller_type value.
df['seller_type'].value_counts()

In [None]:
# 10. What is the relationship between selling_price and km_driven for first-owner bikes?
# Keep only listings whose owner field is exactly '1st owner'.
first_owner = df[df['owner'] == '1st owner']

fig, ax = plt.subplots(figsize=(12, 8))
# No `palette` argument: seaborn only applies a palette when a `hue` variable
# is mapped, so passing one here had no effect on the figure.
sns.scatterplot(x='km_driven', y='selling_price', data=first_owner, ax=ax)
ax.set_xlabel('KM Driven')
ax.set_ylabel('Selling Price')
ax.set_title('km driven vs selling price for 1st owner bike')
plt.show()

In [None]:
# 11. Identify and remove outliers in the km_driven column using the IQR method
# Visual check first: points beyond the whiskers are the outlier candidates.
# The column is passed as `x=` explicitly because the meaning of seaborn's first
# positional parameter has changed between releases; the keyword form always
# draws a horizontal box of km_driven.
ax = sns.boxplot(x=df['km_driven'])
ax.set_title('Distribution of km_driven')
ax.set_xlabel('KM Driven')
plt.show()

In [None]:
# Remove km_driven outliers with the 1.5 * IQR rule.

km_driven = df["km_driven"]

# Quartiles and the interquartile range of km_driven.
Q1 = km_driven.quantile(0.25)
Q3 = km_driven.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 IQRs below Q1 and above Q3.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)

# Rows on or inside the fences are kept; `df` itself is left untouched.
within_fences = (km_driven >= lower_bound) & (km_driven <= upper_bound)
df_no_outliers = df[within_fences]

print("Original dataset size:", df.shape[0])
print("After removing outliers:", df_no_outliers.shape[0])


In [None]:
 # 12. Perform a bivariate analysis to visualize the relationship between year and selling_price
fig, ax = plt.subplots(figsize=(7, 7))
# No `palette` argument: it is only applied when a `hue` variable is mapped,
# so it had no effect on this single-colour scatter.
sns.scatterplot(x='year', y='selling_price', data=df, ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Selling Price')
ax.set_title('Bivariate Analysis of Year vs Selling Price')
plt.show()

In [None]:
# 13. What is the average depreciation in selling price based on the bike's age (current year - manufacturing year)?
# Age is measured against the fixed reference year 2026 and stored on df as `bike_age`.
# NOTE(review): this reports the mean selling price per age, not a depreciation
# amount or rate — confirm that this is the intended answer.
reference_year = 2026
df["bike_age"] = reference_year - df["year"]
price_by_age = df.groupby("bike_age")["selling_price"].mean()
avg_price_by_age = price_by_age.reset_index()
print(avg_price_by_age)

In [None]:
# 14. Which bike names are priced significantly above the average price for their manufacturing year?
# NOTE(review): every bike priced above its year's mean is selected; no
# "significantly above" margin is applied — confirm whether a threshold is wanted.

# Mean selling price per manufacturing year, attached to each row as `year_avg`.
year_avg = df.groupby("year")["selling_price"].mean()
df["year_avg"] = df["year"].map(year_avg)

# Keep rows whose price exceeds the mean of their own year.
is_above_year_avg = df["selling_price"] > df["year_avg"]
above_avg = df[is_above_year_avg]

report_columns = ["name", "year", "selling_price", "year_avg"]
print(above_avg[report_columns])

In [None]:
# 15. Develop a correlation matrix for numeric columns and visualize it using a heatmap.
# NOTE(review): earlier cells add `bike_age` and `year_avg` to df, so when the
# notebook is run top to bottom those derived columns are part of this matrix.
corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_options = dict(annot=True, cmap="Spectral", fmt=".2f", linewidths=0.5)
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Load the car sales data from "Car Sale.csv" (relative path) into `cdf` and preview the first rows.
cdf = pd.read_csv("Car Sale.csv")
cdf.head()

In [None]:
# Summarise column dtypes and non-null counts of the car sales frame.
cdf.info()

In [None]:
# 1. What is the average selling price of cars for each dealer, and how does it compare across different dealers?
# Mean price per dealer, listed from the highest to the lowest average.
dealer_prices = cdf.groupby('Dealer_Name')['Price ($)']
average = dealer_prices.mean().sort_values(ascending=False)
print(average)
# Author's observations from a previous run (not derivable from the code alone):
#   - U-Haul CO sells cars at the highest average price
#   - Buddy Storbeck's Diesel Service Inc sells cars at the lowest average price


In [None]:
# 2. Which car brand (Company) has the highest variation in prices, and what does this tell us about the pricing trends?
# Standard deviation of price per company, largest spread first.
variation = (
    cdf.groupby('Company')['Price ($)']
    .std()
    .sort_values(ascending=False)
)
print(variation)
# Author's observation from a previous run: Lincoln sells budget friendly
# NOTE(review): this remark does not name the company with the highest
# variation — confirm it against the printed ranking.

In [None]:
# 3. What is the distribution of car prices for each transmission type, and how do the interquartile ranges compare?
def _interquartile_range(prices):
    """Return the 75th percentile minus the 25th percentile of `prices`."""
    return prices.quantile(0.75) - prices.quantile(0.25)


# Prices grouped once by transmission and reused for both summaries.
transmission_prices = cdf.groupby('Transmission')['Price ($)']

summary = transmission_prices.describe()
print("Summary statistics per transmission:")
print(summary)

iqr = transmission_prices.apply(_interquartile_range)
print("\nInterquartile Range (IQR) per transmission:")
print(iqr)

In [None]:
# 4. What is the distribution of car prices across different regions?
# One box per dealer region, price on the vertical axis.
ax = sns.boxplot(x='Dealer_Region', y='Price ($)', data=cdf, palette='coolwarm')
ax.set_title("Distribution of Car Prices Across Regions")
ax.set_xlabel("Region")
ax.set_ylabel("Selling Price")
plt.show()

In [None]:
# 5. What is the distribution of cars based on body styles?
# Share of rows per body style, expressed as a percentage.
body_style_share = cdf['Body Style'].value_counts(normalize=True)
distribution = body_style_share * 100
print(distribution)

In [None]:
# 6. How does the average selling price of cars vary by customer gender and annual income?
# NOTE(review): grouping on the raw `Annual Income` values gives one group per
# distinct income; if incomes are mostly unique, income brackets may be needed — confirm.
grouping_keys = ['Gender', 'Annual Income']
mean_price = cdf.groupby(grouping_keys)['Price ($)'].mean()
avg_prices = mean_price.reset_index()
print(avg_prices)

In [None]:
# 7. What is the distribution of car prices by region, and how does the number of cars sold vary by region?

# Descriptive price statistics per region, and each region's share of all rows in percent.
region_price_summary = cdf.groupby('Dealer_Region')['Price ($)'].describe()
region_sales_share = cdf['Dealer_Region'].value_counts(normalize=True) * 100

print("Distribution of car prices by region=\n")
print(region_price_summary)
print("\nProportion of cars sold by region=\n")
print(region_sales_share)

In [None]:
# 8. How does the average car price differ between cars with different engine sizes?
# Bar height is the mean price for each value of the Engine column.
ax = sns.barplot(data=cdf, x='Engine', y='Price ($)', estimator='mean', palette='viridis')
ax.set_title("Average Car Price by Engine Size")
ax.set_xlabel("Engine")
ax.set_ylabel("Average Selling Price")
plt.show()

In [None]:
# 9. How do car prices vary based on the customer’s annual income bracket?

# Bracket edges; pd.cut builds right-closed intervals, so the first bracket is (0, 10000].
bins = [0, 10000, 20000, 40000, 100000, float('inf')]
labels = ["<10k", "10k-20k", "20k-40k", "40k-100k", ">100k"]
cdf["Income Bracket"] = pd.cut(cdf["Annual Income"], bins=bins, labels=labels)

# `observed=False` is stated explicitly: the grouping key is categorical and the
# pandas default for `observed` is changing. Pinning it keeps every bracket in the
# result (empty ones show NaN) regardless of the installed pandas version.
price_by_income = cdf.groupby("Income Bracket", observed=False)["Price ($)"].mean()
print(price_by_income)

In [None]:
# 10. What are the top 5 car models with the highest number of sales, and how does their price distribution look?
# Sales are counted as the number of rows per model.
model_sales = cdf["Model"].value_counts()
top_models = model_sales.head(5).index
print("Top 5 Car Models by Sales:")
print(top_models)

# Restrict to those five models and summarise their prices, most-sold model first.
df_top = cdf[cdf["Model"].isin(top_models)]
price_statistics = ["count", "mean", "median", "std", "min", "max"]
price_distribution = (
    df_top.groupby("Model")["Price ($)"]
    .agg(price_statistics)
    .sort_values("count", ascending=False)
)
print("\nPrice Distribution for Top 5 Models:")
print(price_distribution)

In [None]:
# 11. How does car price vary with engine size across different car colors, and which colors have the highest price variation?
# One violin per engine / colour combination, with quartile lines drawn inside.
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x="Engine", y="Price ($)", hue="Color", data=cdf, inner="quartile", ax=ax)
ax.set_title("Price Distribution by Car Color")
plt.show()
# Author's observation from a previous run: Black Color Of Overhead Camshaft Engine

In [None]:
# 12. Is there any seasonal trend in car sales based on the date of sale?
# Parse the sale date (unparseable values become NaT) and split it into parts.
sale_dates = pd.to_datetime(cdf["Date"], errors="coerce")
cdf["Date"] = sale_dates
cdf["Year"] = sale_dates.dt.year
cdf["Month"] = sale_dates.dt.month
cdf["Day"] = sale_dates.dt.day

# Number of rows (sales) per year and month.
sales_per_month = cdf.groupby(["Year", "Month"]).size()
monthly_sales = sales_per_month.reset_index(name="Sales")

# One line per year so the months can be compared across years.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x="Month", y="Sales", hue="Year", data=monthly_sales, marker="o", ax=ax)
ax.set_title("Monthly Car Sales Trend")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Cars Sold")
plt.show()

In [None]:
# 13. How does the car price distribution change when considering different combinations of body style and transmission type?

# Split violins: each body style shows the transmission types side by side.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(x="Body Style", y="Price ($)", hue="Transmission", data=cdf, split=True, ax=ax)
ax.set_title("Price Distribution by Body Style & Transmission")
plt.show()

In [None]:
# 14. What is the correlation between car price, engine size, and annual income of customers, and how do these features interact?
# NOTE(review): only price and annual income are correlated here; the Engine
# column is not part of the matrix — confirm whether it should be encoded and added.

cols = ["Price ($)", "Annual Income"]
price_income = cdf[cols]
corr = price_income.corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation between Price and Annual Income")
plt.show()


In [None]:
# 15. How does the average car price vary across different car models and engine types?

# Mean price for every (Model, Engine) pair, flattened back into columns.
model_engine_keys = ["Model", "Engine"]
mean_price_by_pair = cdf.groupby(model_engine_keys)["Price ($)"].mean()
avg_prices = mean_price_by_pair.reset_index()
print(avg_prices)

In [None]:
# Load the product data from amazon.csv (relative path) into `adf` and preview the first rows.
adf = pd.read_csv("amazon.csv")
adf.head()

In [None]:
# Summarise column dtypes and non-null counts of the product frame.
adf.info()

In [None]:
# 1. What is the average rating for each product category?
# Ratings are converted to numbers in place; values that cannot be parsed become NaN.
adf['rating'] = pd.to_numeric(adf['rating'], errors='coerce')

# Mean rating per category, best-rated category first.
category_means = adf.groupby('category', as_index=False)['rating'].mean()
avg_rating = category_means.sort_values(by='rating', ascending=False)
print(avg_rating)

In [None]:
# 2. What are the top rating_count products by category?
# Convert rating_count to integers. The column is cast to text first, so the cell
# can be re-run after the column has already been converted: calling `.str`
# directly on an integer column raises AttributeError. Thousands separators are
# removed as literal commas, and missing or non-numeric entries count as 0.
rating_count_text = adf['rating_count'].astype(str).str.replace(',', '', regex=False)
adf['rating_count'] = (
    pd.to_numeric(rating_count_text, errors='coerce')
    .fillna(0)
    .astype(int)
)

# idxmax gives, per category, the row label of the product with the most ratings.
top_row_labels = adf.groupby('category')['rating_count'].idxmax()
top_products_by_category = (
    adf.loc[top_row_labels, ['category', 'product_name', 'rating_count', 'rating']]
    .sort_values(by='rating_count', ascending=False)
)
print(top_products_by_category.head(10))

In [None]:
# 3. What is the distribution of discounted prices vs. actual prices?

def _parse_rupee_amount(column):
    """Convert a price column to numbers.

    The values are cast to text, the rupee sign and comma separators are
    removed as literal characters, and whatever still is not numeric becomes
    NaN. Casting to text first keeps the conversion safe to re-run on a column
    that is already numeric.
    """
    cleaned = (
        column.astype(str)
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # One vectorised conversion instead of calling pd.to_numeric once per row.
    return pd.to_numeric(cleaned, errors='coerce')


adf['actual_price'] = _parse_rupee_amount(adf['actual_price'])
adf['discounted_price'] = _parse_rupee_amount(adf['discounted_price'])

# Two histograms side by side: actual prices on the left, discounted on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

sns.histplot(adf['actual_price'], bins=50, kde=True, color='red', ax=axes[0])
axes[0].set_title("Distribution of Actual Prices")
axes[0].set_xlabel("Actual Price (₹)")
axes[0].set_ylabel("Frequency")

sns.histplot(adf['discounted_price'], bins=50, kde=True, color='blue', ax=axes[1])
axes[1].set_title("Distribution of Discounted Prices")
axes[1].set_xlabel("Discounted Price (₹)")
axes[1].set_ylabel("Frequency")

plt.tight_layout()
plt.show()



In [None]:
# 4. How does the average discount percentage vary across categories?

# Strip the literal percent sign and convert the whole column in one vectorised
# call (rather than once per row); entries that are not numeric become NaN.
discount_text = adf['discount_percentage'].astype(str).str.replace('%', '', regex=False)
adf['discount_percentage'] = pd.to_numeric(discount_text, errors='coerce')

# Mean discount per category, largest average discount first.
avg_discount_per_category = (
    adf.groupby('category')['discount_percentage']
    .mean()
    .sort_values(ascending=False)
)

print(avg_discount_per_category)


In [None]:
# 5. What are the most popular product names?

# Popularity is measured by rating_count; the ten highest counts are shown.
popularity_columns = ['product_name', 'category', 'rating', 'rating_count']
ranked_products = adf[popularity_columns].sort_values(by='rating_count', ascending=False)
most_popular_products = ranked_products.head(10)
print(most_popular_products)


In [None]:
# 6. What are the most popular product keywords?
from collections import Counter

# Lower-case a working copy of the names so counting is case-insensitive.
# adf['product_name'] itself is left untouched, so cells that display product
# names afterwards still show the original text.
lowered_names = adf['product_name'].astype(str).str.lower()

# Split every name on whitespace and pool all words into one list.
all_words = [word for name in lowered_names for word in name.split()]
word_counts = Counter(all_words)
print(word_counts.most_common(20))

In [None]:
# 7. What are the most popular product reviews?
# Reviews are ranked by the rating_count of their product; the top ten are shown.
review_columns = ['product_name', 'category', 'review_title', 'review_content', 'rating_count']
ranked_reviews = adf[review_columns].sort_values(by='rating_count', ascending=False)
popular_reviews = ranked_reviews.head(10)
print(popular_reviews)


In [None]:
# 8. What is the correlation between discounted_price and rating?
# Series.corr with its default method, applied to the two columns.
discounted = adf['discounted_price']
ratings = adf['rating']
correlation = discounted.corr(ratings)
print("Correlation between discounted_price and rating:", correlation)

In [None]:
# 9. What are the Top 5 categories based on the highest ratings?
# Mean rating per category, sorted from best to worst; the first five are the answer.
rating_by_category = adf.groupby('category')['rating'].mean()
avg_rating_per_category = rating_by_category.sort_values(ascending=False)
top5_categories = avg_rating_per_category.head(5)
print(top5_categories)

In [None]:
# 1.  Read the dataframe, check null value if present then do the needful, check duplicate row , if present then do the needful.
# Load the track data from spotify.csv (relative path) into `sdf` and preview the first rows.
sdf = pd.read_csv("spotify.csv")
sdf.head()

In [None]:
# Data type of each column.
sdf.dtypes

In [None]:
# Count of null entries per column.
sdf.isnull().sum()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
sdf[sdf.duplicated()]

In [None]:
# Drop fully duplicated rows, keeping the first occurrence; `sdf` is rebound to the result.
sdf = sdf.drop_duplicates()

In [None]:
# 2. What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.

# 20-bin histogram of Popularity with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(sdf['Popularity'], bins=20, kde=True, color='purple', edgecolor='black', ax=ax)

ax.set_title("Distribution of Track Popularity", fontsize=16)
ax.set_xlabel("Popularity Score", fontsize=12)
ax.set_ylabel("Number of Tracks", fontsize=12)

plt.show()


In [None]:
# 3. Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot.

fig, ax = plt.subplots(figsize=(13, 8))

# No `palette` argument: it is only applied when a `hue` variable is mapped.
sns.scatterplot(x='Popularity', y='Duration (ms)', data=sdf, marker='o', ax=ax)
ax.set_title("Relationship Between Popularity v/s Duration")
# x carries Popularity and y carries Duration (ms); the axis labels must say the same.
ax.set_xlabel("Popularity", fontsize=10)
ax.set_ylabel("Duration (ms)", fontsize=10)

fig.tight_layout()
plt.show()
# Author's observation: duration doesn’t strongly determine popularity

In [None]:
# 4. Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using a countplot?

# value_counts sorts by frequency, so its index gives the bar order (most tracks first).
counts = sdf['Artist'].value_counts()

# Tall figure: one horizontal bar per artist.
fig, ax = plt.subplots(figsize=(12, 20))
sns.countplot(data=sdf, y='Artist', order=counts.index, palette='coolwarm', ax=ax)

ax.set_title("Number of Tracks per Artist")
ax.set_xlabel("Counts")
ax.set_ylabel("Artist")
plt.yticks(rotation=5, fontsize=8)
plt.show()
# Author's answer from a previous run: Drake

In [None]:
# 5. What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each.

# Sort ascending by Popularity and keep the first five rows.
least_popular = sdf.sort_values(by="Popularity", ascending=True).head(5)
print("Top 5 Least Popular Tracks:")
track_fields = zip(
    least_popular['Artist'],
    least_popular['Track Name'],
    least_popular['Popularity'],
)
for artist, track_name, popularity in track_fields:
    print(f"{artist} – {track_name} (Popularity: {popularity})")

In [None]:
# 6. Among the top 5 most popular artists, which artist has the highest popularity on average?
# Calculate and display the average popularity for each artist
# NOTE(review): "top 5" here means the five artists with the most tracks, whereas
# the next question's cell ranks artists by their highest Popularity — confirm
# which definition is intended.

artist_track_counts = sdf['Artist'].value_counts()
top_artists = artist_track_counts.head(5).index
top_df = sdf[sdf['Artist'].isin(top_artists)]
avg_popularity = (
    top_df.groupby('Artist')['Popularity']
    .mean()
    .sort_values(ascending=False)
)
print(avg_popularity)


In [None]:
# 7. For the top 5 most popular artists, what are their most popular tracks? List the track name for each artist.

# Rank artists by the Popularity of their best track and keep the top five.
peak_popularity = sdf.groupby("Artist")["Popularity"].max()
top5_artists = peak_popularity.sort_values(ascending=False).head(5).index

# Within those artists, order tracks so each artist's most popular track comes first,
# then take the first entry per artist.
top5_tracks = sdf[sdf["Artist"].isin(top5_artists)]
ranked_tracks = top5_tracks.sort_values(["Artist", "Popularity"], ascending=[True, False])
first_per_artist = ranked_tracks.groupby("Artist").first()
most_popular_tracks = first_per_artist[["Track Name", "Popularity"]]

print("Most Popular Track for Each of the Top 5 Artists:")
print(most_popular_tracks)


In [None]:
# 8. Visualize relationships between multiple numerical variables simultaneously using a pair plot.

# Columns included in the pair plot.
num_cols = ["Popularity", "Duration (ms)"]
scatter_style = {'alpha': 0.6, 'color': 'teal'}
sns.pairplot(
    sdf[num_cols],
    diag_kind="hist",
    corner=True,
    plot_kws=scatter_style,
)
plt.suptitle("Pair Plot of Numerical Variables", y=1.02, fontsize=16)
plt.show()

In [None]:
# 9. Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or violin plot

# Limit the comparison to the ten artists with the most tracks.
artist_track_counts = sdf['Artist'].value_counts()
top10_artists = artist_track_counts.head(10).index
df_top10 = sdf[sdf['Artist'].isin(top10_artists)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_top10, x='Artist', y='Duration (ms)', palette='Set2', ax=ax)
plt.xticks(rotation=45)
ax.set_title("Distribution of Track Duration by Artist (Box Plot)", fontsize=16)
ax.set_ylabel("Duration (ms)")
plt.show()

In [None]:
# 10. How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a violin plot.
# Uses `df_top10`, the top-ten-artist subset built in the box-plot cell for the previous question.
fig, ax = plt.subplots(figsize=(12, 6))
swarm_style = dict(palette='Set3', size=4, alpha=0.7)
sns.swarmplot(data=df_top10, x='Artist', y='Popularity', ax=ax, **swarm_style)
plt.xticks(rotation=45)
ax.set_title("Distribution of Track Popularity by Artist (Swarm Plot)", fontsize=16)
ax.set_ylabel("Popularity Score")
plt.show()