In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings("ignore")

# Load dataset (path is relative to the working directory)
df = pd.read_csv('BIKE DETAILS.csv')

# Display basic information: sample rows, schema, summary statistics
print("Dataset Overview:")
display(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values: per-column count of null cells
print("\nMissing Values:")
print(df.isnull().sum())

# Visualizing missing values: heatmap of the boolean null mask, one row of
# the figure per record and one column per field
plt.figure(figsize=(10,5))
sns.heatmap(df.isnull(), cmap='viridis', cbar=False, yticklabels=False)
plt.title("Missing Values Heatmap")
plt.show()

# Histograms (30 bins each) of the numeric listing attributes.
numerical_features = ['selling_price', 'year', 'km_driven', 'ex_showroom_price']
df.hist(column=numerical_features, figsize=(12,8), bins=30)
plt.show()

# Annotated heatmap of the pairwise correlations between all numeric columns.
df_numeric = df.select_dtypes(include='number')
corr_fig, corr_ax = plt.subplots(figsize=(12,8))
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title("Feature Correlation")
plt.show()

# Side-by-side boxplots of the same features, used to eyeball outliers.
box_fig, box_ax = plt.subplots(figsize=(12,6))
sns.boxplot(data=df[numerical_features], ax=box_ax)
plt.setp(box_ax.get_xticklabels(), rotation=90)  # vertical tick labels
box_ax.set_title("Boxplot of Numerical Features")
plt.show()

# Scatter-matrix of the numeric features, coloured by seller type.
sns.pairplot(data=df, hue='seller_type', vars=numerical_features)
plt.show()

# Categorical columns inspected below.
categorical_columns = ['name', 'seller_type', 'owner']

# Number of distinct values per categorical column.
for col, n_distinct in df[categorical_columns].nunique().items():
    print(f"Unique values in {col}: {n_distinct}")

# Frequency table per categorical column.
for col in categorical_columns:
    frequency = df[col].value_counts()
    print(f"\nValue counts for {col}:")
    print(frequency)



# Q1 - spread between the highest and the lowest selling price.
price_column = df['selling_price']
selling_price_range = price_column.max() - price_column.min()
print(f"1. Range of selling prices: {selling_price_range}")

# Q2 - median selling price.
median_selling_price = price_column.median()
print(f"2. Median selling price: {median_selling_price}")

# Q3 - most frequent seller type (first entry when several modes tie).
seller_modes = df['seller_type'].mode()
most_common_seller = seller_modes.iloc[0]
print(f"3. Most common seller type: {most_common_seller}")

# Q4 - number of listings with more than 50,000 km on the odometer.
over_50k_mask = df['km_driven'] > 50000
high_km_bikes = len(df[over_50k_mask])
print(f"4. Number of bikes driven more than 50,000 km: {high_km_bikes}")

# Q5 - mean km_driven for each ownership category.
avg_km_per_owner = df.groupby('owner')['km_driven'].mean()
print("5. Average km_driven per ownership type:")
print(avg_km_per_owner)

# Q6 - percentage of listings whose year is 2015 or earlier.
bikes_up_to_2015 = df[df['year'] <= 2015]
older_bikes_ratio = (len(bikes_up_to_2015) / len(df)) * 100
print(f"6. Proportion of bikes from 2015 or older: {older_bikes_ratio:.2f}%")

# Q7 - null count for every column.
missing_values = df.isnull().sum()
print("7. Missing values per column:")
print(missing_values)

# Q8 - row holding the maximum ex_showroom_price (name and price only).
top_price_index = df['ex_showroom_price'].idxmax()
highest_ex_showroom_price = df.loc[top_price_index, ['name', 'ex_showroom_price']]
print(f"8. Highest ex-showroom price: {highest_ex_showroom_price}")

# Q9 - number of listings per seller type.
bikes_per_seller = df['seller_type'].value_counts()
print("9. Total bikes listed per seller type:")
print(bikes_per_seller)

# Q10 - scatter of selling price against distance driven, restricted to the
# rows whose owner field equals '1st owner'.
first_owner_bikes = df[df['owner'] == '1st owner']
scatter_fig, scatter_ax = plt.subplots(figsize=(8,5))
sns.scatterplot(
    data=first_owner_bikes,
    x='km_driven',
    y='selling_price',
    alpha=0.5,
    ax=scatter_ax,
)
scatter_ax.set_title("10. Selling Price vs. Km Driven for First Owner Bikes")
scatter_ax.set_xlabel("Km Driven")
scatter_ax.set_ylabel("Selling Price")
plt.show()

# Q11 - IQR rule on km_driven: keep rows within 1.5 * IQR of the quartiles.
# The filtered rows go to df_cleaned; df itself is left untouched.
Q1, Q3 = df['km_driven'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
within_fences = df['km_driven'].between(lower_bound, upper_bound)  # inclusive on both ends
df_cleaned = df[within_fences]
print("11. Outliers in km_driven removed.")

# Q12 - one selling-price boxplot per year.
year_fig, year_ax = plt.subplots(figsize=(8,5))
sns.boxplot(data=df, x='year', y='selling_price', ax=year_ax)
plt.setp(year_ax.get_xticklabels(), rotation=90)  # vertical tick labels
year_ax.set_title("12. Selling Price vs. Year")
year_ax.set_xlabel("Year")
year_ax.set_ylabel("Selling Price")
plt.show()

# 13. What is the average depreciation in selling price based on the bike's age?
# Age is measured against a fixed reference year so reruns give the same ages.
current_year = 2025  # Update as per current year
df['bike_age'] = current_year - df['year']

# Depreciation is the value lost between the ex-showroom price and the resale
# price; averaging selling_price alone would only give the mean resale price.
# NOTE(review): assumes ex_showroom_price is the price when new - confirm.
# Rows missing either price cannot contribute and are left out of the average.
priced_bikes = df.dropna(subset=['ex_showroom_price', 'selling_price'])
depreciation = priced_bikes['ex_showroom_price'] - priced_bikes['selling_price']
avg_depreciation = depreciation.groupby(priced_bikes['bike_age']).mean()
print("13. Average depreciation in selling price per bike age:")
print(avg_depreciation)

# 14. Which bike names are priced significantly above the average price for their manufacturing year?
# Mean selling price per manufacturing year.
price_above_avg = df.groupby('year')['selling_price'].mean()

# Align each row with its year's mean through a vectorized lookup instead of a
# Python-level row-by-row apply; a row with a missing year gets NaN and is
# simply not selected rather than raising a KeyError.
year_avg_price = df['year'].map(price_above_avg)

# Any price strictly above the year's mean qualifies; no extra margin is applied.
df_above_avg = df[df['selling_price'] > year_avg_price]
print("14. Bikes priced significantly above average for their year:")
print(df_above_avg[['name', 'year', 'selling_price']])

# Q15 - correlation matrix over every numeric column, rendered as a heatmap.
df_numeric = df.select_dtypes(include='number')
corr_matrix = df_numeric.corr()

heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr_matrix,
    annot=True,          # print the coefficient inside each cell
    cmap='coolwarm',
    linewidths=0.5,
    ax=heat_ax,
)
heat_ax.set_title("Correlation Matrix Heatmap")
plt.show()



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings("ignore")
# Load dataset (path is relative to the working directory)
df = pd.read_csv("Car_sale.csv")
# Display basic information: sample rows, schema, summary statistics
print("Dataset Overview:")
display(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())

# Q1 - mean price for each dealer, highest first.
price_by_dealer = df.groupby('Dealer_Name')['Price ($)']
dealer_avg_price = price_by_dealer.mean().sort_values(ascending=False)
print("1. Average selling price per dealer:")
print(dealer_avg_price)

# Q2 - standard deviation of price for each company, most variable first.
price_by_brand = df.groupby('Company')['Price ($)']
brand_price_variation = price_by_brand.std().sort_values(ascending=False)
print("2. Car brand with highest price variation:")
print(brand_price_variation)

# Q3 - price boxplot per transmission value.
transmission_fig, transmission_ax = plt.subplots(figsize=(8,5))
sns.boxplot(data=df, x='Transmission', y='Price ($)', ax=transmission_ax)
transmission_ax.set_title("3. Car Prices by Transmission Type")
plt.show()

# Q4 - price boxplot per dealer region.
region_fig, region_ax = plt.subplots(figsize=(8,5))
sns.boxplot(data=df, x='Dealer_Region', y='Price ($)', ax=region_ax)
region_ax.set_title("4. Car Prices by Region")
plt.show()

# Q5 - bar chart of how many rows fall in each body style.
body_fig, body_ax = plt.subplots(figsize=(8,5))
body_style_counts = df['Body Style'].value_counts()
body_style_counts.plot.bar(ax=body_ax)
body_ax.set_title("5. Car Distribution by Body Style")
plt.show()

# 6. Average car price by gender and income
# Annual Income is a continuous amount, so grouping on its raw values would
# create roughly one group per distinct income and the "average" would mostly
# describe single rows. Incomes are bucketed into quartiles first;
# duplicates='drop' merges buckets whose edges coincide when many rows share
# the same income.
income_quartile = pd.qcut(df['Annual Income'], q=4, duplicates='drop')
gender_income_price = (
    df.groupby(['Gender', income_quartile], observed=True)['Price ($)'].mean()
)
print("6. Average car price by gender and annual income:")
print(gender_income_price)

# Q7 - per region: mean price and number of Car_id entries.
region_aggregations = {'Price ($)': 'mean', 'Car_id': 'count'}
region_sales = df.groupby('Dealer_Region').agg(region_aggregations)
print("7. Car price distribution & sales count by region:")
print(region_sales)

# Q8 - mean price for each Engine value.
price_by_engine = df.groupby('Engine')['Price ($)']
engine_price = price_by_engine.mean()
print("8. Average car price by engine size:")
print(engine_price)

# Q9 - mean price within each quartile of Annual Income.
income_brackets = pd.qcut(df['Annual Income'], q=4)
income_price = df['Price ($)'].groupby(income_brackets).mean()
print("9. Car price variation by income bracket:")
print(income_price)

# Q10 - the five models that occur most often.
model_frequency = df['Model'].value_counts()
model_sales = model_frequency.iloc[:5]
print("10. Top 5 car models by sales count:")
print(model_sales)

# Q11 - price boxplots per colour, split by Engine value via hue.
color_fig, color_ax = plt.subplots(figsize=(10,6))
sns.boxplot(data=df, x='Color', y='Price ($)', hue='Engine', ax=color_ax)
plt.setp(color_ax.get_xticklabels(), rotation=90)  # vertical tick labels
color_ax.set_title("11. Car Prices by Color and Engine Size")
plt.show()

# Q12 - parse the Date column, derive the calendar month, and count Car_id
# entries per month. Both columns are written back onto df.
sale_dates = pd.to_datetime(df['Date'])
df['Date'] = sale_dates
df['Month'] = sale_dates.dt.month
monthly_sales = df['Car_id'].groupby(df['Month']).count()
print("12. Monthly car sales trend:")
print(monthly_sales)

# 13. Car price by body style & transmission
# A FacetGrid with hue= calls the mapped function once per hue subset, so
# sns.boxplot only ever sees one transmission at a time and, with no shared
# `order`, cannot place the boxes consistently across calls and facets.
# catplot builds the same faceted figure in a single call, which keeps the
# category order and colours synchronized in every panel.
grid = sns.catplot(
    data=df,
    kind='box',
    col='Body Style',        # one panel per body style
    x='Transmission',
    y='Price ($)',
    hue='Transmission',      # colour boxes by transmission, as before
    dodge=False,             # hue duplicates x, so keep boxes centred on their tick
    height=4,
    aspect=1.2,
)
plt.show()


# 14. Correlation between price, engine size & income
# The numeric conversion is done on a copy. Coercing and dropping rows on df
# itself would overwrite Engine (turning any text label into NaN) and could
# remove every row, leaving nothing for the Model/Engine grouping in section 15.
corr_columns = ['Price ($)', 'Engine', 'Annual Income']
corr_input = df[corr_columns].copy()

for col in corr_columns:
    as_number = pd.to_numeric(corr_input[col], errors='coerce')
    if as_number.notna().any() or corr_input[col].isna().all():
        # At least some values are numeric: keep them, unparsable cells become NaN.
        corr_input[col] = as_number
    else:
        # No value parses as a number, so the column holds labels. Replace each
        # distinct label by an integer code so it can enter the correlation.
        # NOTE(review): codes follow order of first appearance; the coefficient
        # is only straightforward to interpret when there are two labels.
        codes, _ = pd.factorize(corr_input[col])
        corr_input[col] = pd.Series(
            codes, index=corr_input.index, dtype='float64'
        ).where(codes >= 0)  # factorize marks missing values with -1

# Remove NaNs if necessary (only from the working copy)
corr_input = corr_input.dropna()

# Generate heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_input.corr(), annot=True, cmap='coolwarm')
plt.show()

# Q15 - mean price for every (Model, Engine) combination.
price_by_model_engine = df.groupby(['Model', 'Engine'])['Price ($)']
model_engine_price = price_by_model_engine.mean()
print("15. Average car price by model and engine type:")
print(model_engine_price)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings("ignore")
# Load dataset (path is relative to the working directory)
df = pd.read_csv('amazon.csv')

# Display basic information: sample rows, schema, summary statistics
print("Dataset Overview:")
display(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nSummary Statistics:")
print(df.describe())

def _to_number(values):
    """Convert a column of display-formatted numbers to a numeric Series.

    Currency symbols, thousands separators, percent signs and whitespace are
    stripped before parsing, so text such as "1,099" or "64%" becomes 1099 and
    64 instead of being coerced to NaN. Columns that are already numeric are
    returned unchanged; anything that still cannot be parsed becomes NaN.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    stripped = values.astype(str).str.replace(r'[₹$€£,%\s]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')


# Normalise every numeric-looking column once, up front, so the sections below
# (and the price scatter plot further down) all work on real numbers.
# NOTE(review): assumes these columns may arrive as formatted text - confirm
# against the CSV; already-numeric columns pass through untouched.
for col in ['rating', 'rating_count', 'discounted_price', 'actual_price', 'discount_percentage']:
    df[col] = _to_number(df[col])

# 1. Average rating per product category
avg_rating_category = df.groupby('category')['rating'].mean().sort_values(ascending=False)
print(avg_rating_category)

# 2. Top rating_count products by category
# Get top 3 products by rating count for each category
top_products = df.groupby('category').apply(lambda x: x.nlargest(3, 'rating_count'))

# Display result
print(top_products)

# 3. Distribution of discounted vs. actual prices
plt.figure(figsize=(8,5))
sns.histplot(df['discounted_price'], bins=30, label='Discounted Price', kde=True)
sns.histplot(df['actual_price'], bins=30, label='Actual Price', kde=True, color='red')
plt.legend()
plt.title("3. Price Distribution: Discounted vs. Actual")
plt.show()

# 4. Average discount percentage by category
avg_discount_category = df.groupby('category')['discount_percentage'].mean().sort_values(ascending=False)

# Display results
print(avg_discount_category)

# 5. Most popular product names
# Get the top 5 most popular products based on rating count
most_popular_products = df[['product_name', 'rating_count']].groupby('product_name').sum().nlargest(5, 'rating_count')

# Display result
print(most_popular_products)

# 6. Most popular product keywords
# Counting raw whitespace-separated tokens treats "Cable", "cable" and
# "cable," as different words and lets function words such as "and"/"the"
# take the top spots. Descriptions are lower-cased, split into word tokens
# (a letter followed by letters, digits or hyphens), and common English
# function words are dropped before counting.
_STOPWORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from',
    'has', 'have', 'in', 'is', 'it', 'its', 'of', 'on', 'or', 'that', 'the',
    'this', 'to', 'up', 'will', 'with', 'you', 'your', 'not', 'all', 'also',
}
description_tokens = (
    df['about_product']
    .dropna()
    .astype(str)
    .str.lower()
    .str.findall(r'[a-z][a-z0-9\-]+')
    .explode()   # one token per row; empty descriptions yield NaN
    .dropna()
)
keywords = description_tokens[~description_tokens.isin(_STOPWORDS)].tolist()
keyword_counts = pd.Series(keywords, dtype='object').value_counts().head(10)
print("6. Most popular product keywords:")
print(keyword_counts)

# 7. Most popular product reviews: the ten most frequent review_title values
popular_reviews = df['review_title'].value_counts().head(10)
print("7. Most popular product reviews:")
print(popular_reviews)

# Q8 - scatter of rating against discounted price.
rating_fig, rating_ax = plt.subplots(figsize=(8,5))
sns.scatterplot(
    data=df,
    x='discounted_price',
    y='rating',
    alpha=0.5,
    ax=rating_ax,
)
rating_ax.set_title("8. Discounted Price vs. Rating")
rating_ax.set_xlabel("Discounted Price")
rating_ax.set_ylabel("Rating")
plt.show()

# Q9 - the five categories with the largest mean rating.
category_mean_rating = df.groupby('category')['rating'].mean()
top_categories = category_mean_rating.nlargest(5)
print("9. Top 5 categories with highest ratings:")
print(top_categories)

# Q10 - list the columns that contain nulls, followed by two fixed suggestions.
missing_values = df.isnull().sum()
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("10. Potential areas for improvement:")
print("- Missing data in columns:")
print(columns_with_gaps)
for suggestion in (
    "- Consider analyzing review sentiment for deeper insights.",
    "- Check if highly rated products align with top-selling ones.",
):
    print(suggestion)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides every warning category, deprecations included.
warnings.filterwarnings("ignore")
# Load dataset (path is relative to the working directory)
df = pd.read_csv('spotify.csv')

# 1. Check for null values and duplicates
# Report what is about to be removed before removing it, so the reader can
# see how much data the cleaning step costs.
duplicate_rows = int(df.duplicated().sum())
null_counts = df.isnull().sum()
print(f"Duplicate rows found: {duplicate_rows}")
print("Null values per column:")
print(null_counts)

# Drop exact duplicate rows, then every row with a null in any column.
df = df.drop_duplicates().dropna()
print("1. Data cleaned: Removed duplicates and handled missing values.")

# Q2 - histogram (20 bins) of Popularity with a KDE overlay.
popularity_fig, popularity_ax = plt.subplots(figsize=(8,5))
sns.histplot(data=df, x='Popularity', bins=20, kde=True, color='blue', ax=popularity_ax)
popularity_ax.set_title("2. Distribution of Track Popularity")
popularity_ax.set_xlabel("Popularity")
popularity_ax.set_ylabel("Count")
plt.show()

# Q3 - scatter of Popularity against track duration in milliseconds.
duration_fig, duration_ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=df, x='Duration (ms)', y='Popularity', alpha=0.5, ax=duration_ax)
duration_ax.set_title("3. Popularity vs. Duration of Tracks")
duration_ax.set_xlabel("Duration (ms)")
duration_ax.set_ylabel("Popularity")
plt.show()

# Q4 - horizontal count bars for the ten artists with the most rows.
busiest_artists = df['Artist'].value_counts().head(10).index
artist_fig, artist_ax = plt.subplots(figsize=(12,6))
sns.countplot(data=df, y='Artist', order=busiest_artists, ax=artist_ax)
artist_ax.set_title("4. Top 10 Artists with the Most Tracks")
artist_ax.set_xlabel("Count of Tracks")
artist_ax.set_ylabel("Artist")
plt.show()

# Columns shown in the track listings below.
track_columns = ['Artist', 'Track Name', 'Popularity']

# Q5 - the five rows with the smallest Popularity.
least_popular = df[track_columns].nsmallest(5, 'Popularity')
print("5. Top 5 Least Popular Tracks:")
print(least_popular)

# Q6 - the five artists with the highest mean Popularity.
mean_popularity_by_artist = df.groupby('Artist')['Popularity'].mean()
top_artists = mean_popularity_by_artist.nlargest(5)
print("6. Average Popularity of Top 5 Artists:")
print(top_artists)

# Q7 - each artist's highest-Popularity row, then only the artists from Q6.
best_track_index = df.groupby('Artist')['Popularity'].idxmax()
top_tracks = df.loc[best_track_index, track_columns]
is_top_artist = top_tracks['Artist'].isin(top_artists.index)
top_tracks = top_tracks[is_top_artist]
print("7. Most Popular Tracks of Top 5 Artists:")
print(top_tracks)

# 8. Pair plot of numerical variables
# No hue variable is assigned, so a palette would have nothing to colour by;
# the argument is omitted rather than passed without effect.
sns.pairplot(df[['Popularity', 'Duration (ms)']])
plt.show()

# Rows belonging to the artists in top_artists; shared by both plots below.
top_artist_tracks = df[df['Artist'].isin(top_artists.index)]

# 9. Duration variation across artists
# Passing palette without hue is deprecated in recent seaborn releases. The
# supported way to colour each artist differently is to assign the x variable
# to hue as well; dodge=False keeps every box centred on its tick.
plt.figure(figsize=(12,6))
duration_ax = sns.boxplot(
    x='Artist', y='Duration (ms)', hue='Artist', dodge=False,
    data=top_artist_tracks, palette="Set2",
)
# Older seaborn versions draw a legend for hue; it only repeats the x labels.
if duration_ax.get_legend() is not None:
    duration_ax.get_legend().remove()
plt.xticks(rotation=90)
plt.title("9. Duration Variation Across Artists")
plt.show()

# 10. Popularity distribution across different artists
plt.figure(figsize=(12,6))
popularity_ax = sns.violinplot(
    x='Artist', y='Popularity', hue='Artist', dodge=False,
    data=top_artist_tracks, palette="dark",
)
if popularity_ax.get_legend() is not None:
    popularity_ax.get_legend().remove()
plt.xticks(rotation=90)
plt.title("10. Popularity Distribution Across Artists")
plt.show()
