# TASK-1

 Exploratory Data Analysis (Univariate and Bivariate) 


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the CSV that every later cell reads through the notebook-global `data`.
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Display the loaded DataFrame; the notebook renders the cell's final bare expression.
data

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the dataset's columns.
data.describe()

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Count the missing entries in every column.
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Number of distinct values in each object-typed (string) column.
object_columns = data.select_dtypes(include=['object'])
object_columns.nunique()

In [None]:
def _top_value_counts(column):
    """Return the five most frequent values of *column* with their counts."""
    return column.value_counts().head()


# Most frequent values of every object-typed column, side by side.
data.select_dtypes(include=['object']).apply(_top_value_counts)

In [None]:
# Pairwise correlation matrix restricted to the numerical columns.
numeric_columns = [
    'Postal Code',
    'Model Year',
    'Electric Range',
    'Base MSRP',
    'Legislative District',
    'DOL Vehicle ID',
    '2020 Census Tract',
]
data[numeric_columns].corr()


In [None]:
# Share of missing values (in percent) for the columns inspected here.
columns_to_check = ['Model', 'Legislative District', 'Vehicle Location', 'Electric Utility']
missing_fraction = data[columns_to_check].isnull().mean()
missing_fraction * 100


In [None]:
# Unique values in specific categorical columns.
# A notebook cell only echoes its final bare expression, so the 'Make' result
# used to be computed and silently discarded. Print both explicitly instead.
for column in ('Make', 'Electric Vehicle Type'):
    print(f"Unique values in '{column}':\n", data[column].unique())


In [None]:
# Univariate view: histogram of the 'Model Year' column in 20 bins.
model_year_axes = sns.histplot(data['Model Year'], bins=20, color='blue')
model_year_axes.set_title('distribution of vehicle model years')
model_year_axes.set_xlabel('Model Year')
model_year_axes.set_ylabel('Frequency')
plt.show()

In [None]:
# The five model years that occur most often in the dataset.
model_year_counts = data['Model Year'].value_counts()
most_frequent_years = model_year_counts.head(5)
print("Most Frequent Model Years:\n", most_frequent_years)

In [None]:
# Summary statistics for the 'Electric Range' column.
electric_range_summary = data['Electric Range'].describe()
print("Descriptive Statistics for Electric Range:\n", electric_range_summary)

In [None]:
# Histogram of 'Electric Range' in 30 bins.
# The seaborn style must be set BEFORE the figure/axes are created: it works by
# changing matplotlib defaults, which only affect axes created afterwards.
# Previously it was called after plotting and had no effect on this figure.
sns.set(style='darkgrid')
plt.figure(figsize=(12, 6))
sns.histplot(data['Electric Range'], bins=30, color='green')
plt.title('Distribution of Electric Range')
plt.xlabel('Electric Range')
plt.ylabel('frequency')
plt.show()

In [None]:
# Box plot of 'Electric Range'.
# Pass the values explicitly as `x`: a positional Series is interpreted as
# `data` by seaborn >= 0.12, which draws the box vertically and leaves the
# 'Electric Range (miles)' label on the wrong (category) axis.
plt.figure(figsize=(8, 6))
sns.boxplot(x=data['Electric Range'], color='green')
plt.title('Boxplot of Electric Range')
plt.xlabel('Electric Range (miles)')
plt.show()

In [None]:
# Histogram of 'Base MSRP' in 30 bins, without a density curve.
plt.figure(figsize=(8, 6))
msrp_axes = sns.histplot(data['Base MSRP'], bins=30, kde=False, color='red')
msrp_axes.set_title('Distribution of Base MSRP')
msrp_axes.set_xlabel('Base MSRP (USD)')
msrp_axes.set_ylabel('Frequency')
plt.show()


In [None]:
# Bar chart of the ten most frequent values in the 'Make' column.
plt.figure(figsize=(10, 6))
top_makes = data['Make'].value_counts().head(10)
make_axes = top_makes.plot(kind='bar', color='purple')
make_axes.set_title('Top 10 Electric Vehicle Manufacturers')
make_axes.set_xlabel('Manufacturer')
make_axes.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of the ten most frequent values in the 'Model' column.
plt.figure(figsize=(10, 6))
top_models = data['Model'].value_counts().head(10)
model_axes = top_models.plot(kind='bar', color='orange')
model_axes.set_title('Top 10 Electric Vehicle Models')
model_axes.set_xlabel('Model')
model_axes.set_ylabel('Frequency')
plt.show()

In [None]:
# Bivariate view: 'Electric Range' (x) against 'Base MSRP' (y), one point per row.
plt.figure(figsize=(8, 6))
scatter_axes = sns.scatterplot(data=data, x='Electric Range', y='Base MSRP', color='blue')
scatter_axes.set_title('Electric Range vs Base MSRP')
scatter_axes.set_xlabel('Electric Range (miles)')
scatter_axes.set_ylabel('Base MSRP (USD)')
plt.show()


In [None]:

# 2x2 correlation matrix of 'Electric Range' and 'Base MSRP', printed and
# then drawn as a heatmap.
range_and_msrp = data[['Electric Range', 'Base MSRP']]
correlation = range_and_msrp.corr()
print("Correlation between Electric Range and Base MSRP:\n", correlation)
heatmap_axes = sns.heatmap(correlation, cmap='coolwarm')
heatmap_axes.set_title('Electric Range and Base MSRP')
plt.show()


In [None]:
# Line plot of 'Base MSRP' against 'Model Year' with a circle marker per year.
plt.figure(figsize=(8, 6))
line_axes = sns.lineplot(data=data, x='Model Year', y='Base MSRP', marker='o', color='green')
line_axes.set_title('Model Year vs Base MSRP (Line Plot)')
line_axes.set_xlabel('Model Year')
line_axes.set_ylabel('Base MSRP (USD)')
plt.show()

In [None]:
# One 'Electric Range' box per 'Make'; tick labels are rotated so the many
# manufacturer names do not overlap.
plt.figure(figsize=(12, 6))
box_axes = sns.boxplot(data=data, x='Make', y='Electric Range')
plt.xticks(rotation=90)
box_axes.set_title('Make vs Electric Range')
box_axes.set_xlabel('Make')
box_axes.set_ylabel('Electric Range (miles)')
plt.show()

In [None]:
# Contingency table: rows are makes, columns are electric vehicle types,
# cells are row counts.
make_vs_type = pd.crosstab(index=data['Make'], columns=data['Electric Vehicle Type'])

# Draw the table as an annotated heatmap (integer cell labels).
plt.figure(figsize=(12, 8))
crosstab_axes = sns.heatmap(make_vs_type, cmap="Blues", annot=True, fmt="d")
crosstab_axes.set_title('Make vs Electric Vehicle Type')
crosstab_axes.set_xlabel('Electric Vehicle Type')
crosstab_axes.set_ylabel('Make')
plt.show()

In [None]:
# Handling missing values (example).
#
# Results are assigned back to the column instead of using
# `data[col].fillna(..., inplace=True)`: that form operates on an intermediate
# object, emits a FutureWarning on pandas 2.x and silently leaves `data`
# unchanged once Copy-on-Write is enabled (the default from pandas 3.0).

# For numerical columns: use the column median for imputation.
district_median = data['Legislative District'].median()
data['Legislative District'] = data['Legislative District'].fillna(district_median)

# For categorical columns: fill with the placeholder 'Unknown'.
for column in ('Vehicle Location', 'Electric Utility'):
    data[column] = data[column].fillna('Unknown')

# NOTE(review): 'Model' is also inspected for missing values earlier in the
# notebook but is not imputed here - confirm whether that is intentional.

# Verify missing values are handled.
print("Missing Values After Imputation:\n", data.isnull().sum())



# TASK-2

In [None]:
import plotly.express as px

In [None]:
# Interactive scatter of 'Electric Range' against 'Model Year'.
px.scatter(
    data_frame=data,
    x='Model Year',
    y='Electric Range',
)

In [None]:
 px.box(data, y="DOL Vehicle ID", title="Box Plot of DOL Vehicle ID", 
             labels={"DOL Vehicle ID": "DOL Vehicle ID"},
             width=800, height=500)


In [None]:
# Interactive box plot of 'DOL Vehicle ID', one box per 'Make'.
px.box(
    data_frame=data,
    x="Make",
    y="DOL Vehicle ID",
    title="Bivariate Box Plot of DOL Vehicle ID by Make",
)

In [None]:
# Pie chart with one slice per 'Make', sized by that make's 'Base MSRP' values.
px.pie(
    data_frame=data,
    names='Make',
    values='Base MSRP',
    title='Distribution of Base MSRP by Make',
)

In [None]:
# Choropleth of electric range per state.
# A choropleth needs exactly one value per location. Passing the raw rows
# repeats every state once per vehicle, so the colour shown for a state is
# whichever row happens to be drawn last (and every row is shipped to the
# browser). Aggregate to the mean range per state first.
state_electric_range = data.groupby('State', as_index=False)['Electric Range'].mean()

px.choropleth(
    state_electric_range,
    locations='State',
    locationmode='USA-states',
    color='Electric Range',  # mean electric range of the state's rows
    scope='usa',  # zoom to the US, matching the animated map cell
    title='Electric Range by State',
)

In [None]:
 px.choropleth(data, locations='State', locationmode='USA-states', color='Base MSRP', animation_frame='Model Year', 
                              title='Base MSRP by State Over Model Year',
                              scope='usa',  
                              color_continuous_scale='Viridis')

# TASK-3

In [None]:
import bar_chart_race as bcr

In [None]:
# Report which bar_chart_race release is installed.
import bar_chart_race

installed_version = bar_chart_race.__version__
print(installed_version)


In [None]:
import bar_chart_race as bcr

# Wide table built from `data`: one row per model year, one column per make,
# each cell the number of rows for that pair (0 where a pair never occurs).
counts_by_year_and_make = data.groupby(['Model Year', 'Make']).size()
ev_make_counts = counts_by_year_and_make.unstack().fillna(0)

# Placement and font of the period label drawn on the chart.
period_label_style = dict(x=.95, y=.15, ha='right', va='center', size=72, weight='semibold')
# Styling forwarded to the bars.
bar_style = dict(alpha=.99, lw=0)

# Render the racing bar chart to an MP4 file.
bcr.bar_chart_race(
    df=ev_make_counts,
    filename='ev_make_racing_bar.mp4',
    title='EV Make Count Over the Years',
    orientation='h',
    sort='desc',
    steps_per_period=50,
    period_length=2000,
    period_fmt='{x:.0f}',
    period_label=period_label_style,
    bar_kwargs=bar_style,
)


In [None]:
# Wide table: rows are model years, columns are makes, cells are row counts
# (0 where a make has no rows for a year).
counts_per_year_make = data.groupby(['Model Year', 'Make']).size()
ev_sales_counts = counts_per_year_make.unstack().fillna(0)

# Keyword arguments for the animation, collected in one place.
race_options = dict(
    filename='year_wise_sales_make_race.mp4',  # output video file
    orientation='h',                           # horizontal bars
    sort='desc',                               # largest bar on top
    n_bars=10,                                 # only the ten leading makes
    steps_per_period=50,                       # frames per model year
    period_length=2000,                        # milliseconds per model year
    title='Year-wise Sales of EV Makes Over the year',
    bar_kwargs={'alpha': .99, 'lw': 0},        # bar transparency and edge width
    period_label={                             # where and how the year is drawn
        'x': .95, 'y': .15,
        'ha': 'right', 'va': 'center',
        'size': 72, 'weight': 'semibold',
    },
    period_fmt='{x:.0f}',                      # show the year without decimals
)

# Create the racing bar chart animation.
bcr.bar_chart_race(df=ev_sales_counts, **race_options)