In [1]:
# plotly Library installation

!pip install plotly



In [2]:
# importing the necessary libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import plotly.express as px
import plotly.graph_objects as go

import datetime as dt
import calendar


In [3]:
# Reading the dataset
# NOTE(review): this is an absolute, machine-specific path — the notebook only runs
# where this exact file location exists.
df =  pd.read_csv("D:/GITAM/BIO/Interns/Oasis Infobyte/Task--1/Datasets/Unemployment in India.csv")
# Preview the first five rows.
df.head()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.5,Rural
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural


In [4]:
# Remove every space character from the column names
# (e.g. 'Estimated Employed' becomes 'EstimatedEmployed') so that later cells
# can reference columns by a compact, whitespace-free name.

df.columns = df.columns.str.replace(' ', '')
# Show the resulting column names to confirm the rename.
df.columns.tolist()

['Region',
 'Date',
 'Frequency',
 'EstimatedUnemploymentRate(%)',
 'EstimatedEmployed',
 'EstimatedLabourParticipationRate(%)',
 'Area']

# Data Preprocessing

In [5]:
# Print dtypes and non-null counts per column to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Region                               740 non-null    object 
 1   Date                                 740 non-null    object 
 2   Frequency                            740 non-null    object 
 3   EstimatedUnemploymentRate(%)         740 non-null    float64
 4   EstimatedEmployed                    740 non-null    float64
 5   EstimatedLabourParticipationRate(%)  740 non-null    float64
 6   Area                                 740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,EstimatedUnemploymentRate(%),EstimatedEmployed,EstimatedLabourParticipationRate(%)
count,740.0,740.0,740.0
mean,11.787946,7204460.0,42.630122
std,10.721298,8087988.0,8.111094
min,0.0,49420.0,13.33
25%,4.6575,1190404.0,38.0625
50%,8.35,4744178.0,41.16
75%,15.8875,11275490.0,45.505
max,76.74,45777510.0,72.57


In [7]:
# (rows, columns) of the frame before any rows are dropped.
df.shape

(768, 7)

In [8]:
# Count the missing values in each column.
df.isnull().sum()

Region                                 28
Date                                   28
Frequency                              28
EstimatedUnemploymentRate(%)           28
EstimatedEmployed                      28
EstimatedLabourParticipationRate(%)    28
Area                                   28
dtype: int64

In [9]:
# Drop every row that contains at least one missing value.
# This mutates `df` in place, so re-running earlier inspection cells will show the reduced frame.
df.dropna(inplace=True)
# Re-count the missing values to confirm none remain.
df.isnull().sum()

Region                                 0
Date                                   0
Frequency                              0
EstimatedUnemploymentRate(%)           0
EstimatedEmployed                      0
EstimatedLabourParticipationRate(%)    0
Area                                   0
dtype: int64

In [10]:
# List the column names again as a reference for the cells that follow.
df.columns.tolist()

['Region',
 'Date',
 'Frequency',
 'EstimatedUnemploymentRate(%)',
 'EstimatedEmployed',
 'EstimatedLabourParticipationRate(%)',
 'Area']

In [13]:
# Parse the day-first date strings, then derive the month number and its abbreviated name.

df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['month_int'] = df['Date'].dt.month


def _month_abbr(month_number):
    """Return the abbreviated month name (e.g. 'May') for a 1-12 month number."""
    return calendar.month_abbr[int(month_number)]


df['month'] = df['month_int'].map(_month_abbr)
df.head()

Unnamed: 0,Region,Date,Frequency,EstimatedUnemploymentRate(%),EstimatedEmployed,EstimatedLabourParticipationRate(%),Area,month_int,month
0,Andhra Pradesh,2019-05-31,Monthly,3.65,11999139.0,43.24,Rural,5,May
1,Andhra Pradesh,2019-06-30,Monthly,3.05,11755881.0,42.05,Rural,6,Jun
2,Andhra Pradesh,2019-07-31,Monthly,3.75,12086707.0,43.5,Rural,7,Jul
3,Andhra Pradesh,2019-08-31,Monthly,3.32,12285693.0,43.97,Rural,8,Aug
4,Andhra Pradesh,2019-09-30,Monthly,5.17,12256762.0,44.68,Rural,9,Sep


In [14]:
# Keep only the observations dated inside the window treated as the COVID period
# (1 Jan 2020 through 31 Dec 2022, both ends included).

start_date, end_date = pd.to_datetime('2020-01-01'), pd.to_datetime('2022-12-31')
covid_df = df[df['Date'].between(start_date, end_date)]

In [15]:
# Keep only the observations dated inside the pre-COVID baseline window
# (1 Jan 2016 through 31 Dec 2019, both ends included).

start_date = pd.to_datetime('2016-01-01')
end_date = pd.to_datetime('2019-12-31')
non_covid_df = df.loc[df['Date'].between(start_date, end_date)]

In [16]:
# Average unemployment rate over all rows of the COVID-period subset
avg_unemployment_c = covid_df['EstimatedUnemploymentRate(%)'].mean()
# Display the mean rounded to the nearest whole number.
avg_unemployment_c.round()

15.0

In [17]:
# Average unemployment rate over all rows of the non-COVID (baseline) subset
avg_unemployment = non_covid_df['EstimatedUnemploymentRate(%)'].mean()
# Display the mean rounded to the nearest whole number.
avg_unemployment.round()

9.0

In [18]:
# It can be observed that the unemployment rate is significantly higher during the COVID period compared to the usual baseline.
# During the COVID-19 period, the estimated unemployment rate is approximately 15%, whereas in the non-COVID period it stands at around 9%.

In [19]:
# Find the single COVID-period rows holding the highest and lowest unemployment rate
# and format their dates as "<Month name> <Year>".
covid_rates = covid_df['EstimatedUnemploymentRate(%)']
peak_row, trough_row = covid_rates.idxmax(), covid_rates.idxmin()

max_month = covid_df.loc[peak_row, 'Date'].strftime('%B %Y')
min_month = covid_df.loc[trough_row, 'Date'].strftime('%B %Y')

print("Month with highest unemployment rate:", max_month)
print("Month with lowest unemployment rate:", min_month)

Month with highest unemployment rate: April 2020
Month with lowest unemployment rate: June 2020


# Visualizations

In [35]:
# Rebuild the COVID-period subset (1 Jan 2020 - 31 Dec 2022, inclusive)
start_date, end_date = pd.to_datetime('2020-01-01'), pd.to_datetime('2022-12-31')
covid_df = df[df['Date'].between(start_date, end_date)]

# One line per region showing the unemployment rate over time during Covid-19
fig = px.line(
    covid_df,
    x='Date',
    y='EstimatedUnemploymentRate(%)',
    title='Unemployment Rate during Covid-19',
    color='Region',
    labels={'EstimatedUnemploymentRate(%)': 'Unemployment Rate (%)'},
    template='plotly_dark',  # dark theme, consistent with the other figures
)

# Render the figure
fig.show()

# Based on the provided graph, it is evident that the region of Puducherry recorded the highest unemployment rate, reaching 76% on April 30, 2020.

In [21]:
# Rebuild the non-COVID subset (1 Jan 2016 - 31 Dec 2019, inclusive)
start_date = pd.to_datetime('2016-01-01')
end_date = pd.to_datetime('2019-12-31')
non_covid_df = df.loc[df['Date'].between(start_date, end_date)]

# One line per region showing the unemployment rate over time outside the COVID period
fig = px.line(
    non_covid_df,
    x='Date',
    y='EstimatedUnemploymentRate(%)',
    color='Region',
    title='Unemployment Rate outside of Covid-19 by Region',
    labels={'EstimatedUnemploymentRate(%)': 'Unemployment Rate (%)'},
    template='plotly_dark',
)

fig.show()

# Based on the given graph, we can conclude that outside the COVID-19 period, Tripura had the highest unemployment rate, which was recorded at 34.69% on June 30, 2019.

In [22]:
# Mean unemployment rate per region over the non-COVID subset,
# ordered from the highest average to the lowest.
state_avg_unemployment = (
    non_covid_df
    .groupby('Region')['EstimatedUnemploymentRate(%)']
    .mean()
    .reset_index()
    .sort_values('EstimatedUnemploymentRate(%)', ascending=False)
)



In [23]:
# Plot the state-wise average unemployment rate for the non-COVID period:
# one bar per region, drawn in the row order of `state_avg_unemployment`.
fig = px.bar(state_avg_unemployment, x='Region', y='EstimatedUnemploymentRate(%)',
             title='State-wise Average Unemployment Rate (Non-Covid Period)',color='Region',
             labels={'EstimatedUnemploymentRate(%)': 'Unemployment Rate (%)'},
             template='plotly_dark')

fig.show()

# During the entire time period excluding the COVID-19 period, Tripura state had the highest unemployment rate.

In [38]:
# Group the COVID-period data by state and calculate the average unemployment rate
state_avg_unemployment_c = covid_df.groupby('Region')['EstimatedUnemploymentRate(%)'].mean().reset_index()

# Sort the COVID-period averages in descending order.
# The COVID frame itself must be sorted here: sorting `state_avg_unemployment`
# (the non-COVID averages) would overwrite the result above with non-COVID data.
state_avg_unemployment_c = state_avg_unemployment_c.sort_values('EstimatedUnemploymentRate(%)', ascending=False)

In [39]:
# Plot the state-wise average unemployment rate held in `state_avg_unemployment_c`:
# one bar per region, drawn in the row order of that frame.
fig = px.bar(state_avg_unemployment_c, x='Region', y='EstimatedUnemploymentRate(%)',
             title='State-wise Average Unemployment Rate (Covid Period)',color='Region',
             labels={'EstimatedUnemploymentRate(%)': 'Unemployment Rate (%)'},
             template='plotly_dark')

fig.show()

# Throughout the entire COVID-19 time period, Tripura experienced a high unemployment rate.

In [44]:

# Pick the five regions with the highest average unemployment rate over the whole dataset.
# The next cell draws one chart per selected region.

# Mean unemployment rate per region across every row of `df`
region_avg_unemployment_all_time = (
    df.groupby('Region')['EstimatedUnemploymentRate(%)']
    .mean()
    .reset_index()
)

# Keep the five rows with the largest averages (returned in descending order)
top_5_regions = region_avg_unemployment_all_time.nlargest(
    n=5, columns='EstimatedUnemploymentRate(%)'
)



In [45]:
# Draw a separate unemployment-rate time series for each of the top 5 regions
for region in top_5_regions['Region'].tolist():
    # All rows of the full dataset belonging to the current region
    region_data = df.loc[df['Region'] == region]
    fig = px.line(
        region_data,
        x='Date',
        y='EstimatedUnemploymentRate(%)',
        title=f'Unemployment Rate for {region}',
        labels={'EstimatedUnemploymentRate(%)': 'Unemployment Rate (%)'},
        template='plotly_dark',
    )
    fig.show()

In [46]:

# Split the observations by area type
rural_df = df.loc[df['Area'] == 'Rural']
urban_df = df.loc[df['Area'] == 'Urban']


def _mean_of_first_five(rates):
    """Return the mean of the first five values of one region's rate series."""
    return rates.head(5).mean()


# Per region, average ONLY the first five unemployment-rate rows (in frame order)
# of each area type — not the full history.
rural_avg_unemployment = (
    rural_df.groupby('Region')['EstimatedUnemploymentRate(%)']
    .apply(_mean_of_first_five)
    .reset_index()
)
urban_avg_unemployment = (
    urban_df.groupby('Region')['EstimatedUnemploymentRate(%)']
    .apply(_mean_of_first_five)
    .reset_index()
)

# Join the two summaries on region; the suffixes distinguish the rural and urban rate columns
merged_df = pd.merge(
    rural_avg_unemployment,
    urban_avg_unemployment,
    on='Region',
    suffixes=('_rural', '_urban'),
)

# Order the regions by their rural average, highest first
sorted_df = merged_df.sort_values(by='EstimatedUnemploymentRate(%)_rural', ascending=False)



In [47]:
# Plot the comparison of unemployment rates for rural and urban areas:
# two side-by-side bars per region (barmode='group'), one for each rate column of `sorted_df`.
fig = px.bar(sorted_df, x='Region', y=['EstimatedUnemploymentRate(%)_rural', 'EstimatedUnemploymentRate(%)_urban'],
             barmode='group', title='Comparison of Unemployment Rates: Rural vs Urban',
             labels={'value': 'Unemployment Rate (%)', 'variable': 'Area'},
             template='plotly_dark')

fig.show()

In [48]:
# State-wise analysis using a box plot: distribution of the unemployment rate per region
# over every row of `df`.

fig = px.box(df,x='Region',y='EstimatedUnemploymentRate(%)',color='Region',title='Unemployment rate',template='plotly_dark')
# Order the regions on the x axis by descending total of their y values.
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [31]:
# Second dataset; the file name indicates it holds data up to 11/2020 (not 2022).
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.

df1 = pd.read_csv("D:/GITAM/BIO/Interns/Oasis Infobyte/Task--1/Datasets/Unemployment_Rate_upto_11_2020.csv")

In [32]:
# Remove every space character from the column names of the second dataset.
df1.columns = df1.columns.str.replace(' ', '')

# Show the resulting column names.
df1.columns.tolist()

['Region',
 'Date',
 'Frequency',
 'EstimatedUnemploymentRate(%)',
 'EstimatedEmployed',
 'EstimatedLabourParticipationRate(%)',
 'Region.1',
 'longitude',
 'latitude']

In [33]:
# Parse the day-first date strings, then derive the month number and its abbreviated name.
df1['Date'] = pd.to_datetime(df1['Date'], dayfirst=True)
df1['month_int'] = df1['Date'].dt.month
df1['month'] = [calendar.month_abbr[int(month_number)] for month_number in df1['month_int']]

df1.head()

Unnamed: 0,Region,Date,Frequency,EstimatedUnemploymentRate(%),EstimatedEmployed,EstimatedLabourParticipationRate(%),Region.1,longitude,latitude,month_int,month
0,Andhra Pradesh,2020-01-31,M,5.48,16635535,41.02,South,15.9129,79.74,1,Jan
1,Andhra Pradesh,2020-02-29,M,5.83,16545652,40.9,South,15.9129,79.74,2,Feb
2,Andhra Pradesh,2020-03-31,M,5.79,15881197,39.18,South,15.9129,79.74,3,Mar
3,Andhra Pradesh,2020-04-30,M,20.51,11336911,33.1,South,15.9129,79.74,4,Apr
4,Andhra Pradesh,2020-05-31,M,17.43,12988845,36.46,South,15.9129,79.74,5,May


In [34]:
# Animated bubble map: one bubble per region, sized by unemployment rate, one frame per month.
#
# NOTE: the coordinate column names in this dataset are swapped. The column called
# 'longitude' holds latitude-like values (15.9129 for Andhra Pradesh) and the column
# called 'latitude' holds longitude-like values (79.74). The mapping below is
# therefore intentional and is spelled out with keywords to make that explicit.
fig = px.scatter_geo(df1, lat='longitude', lon='latitude', color="Region",
                     hover_name="Region", size="EstimatedUnemploymentRate(%)",
                     animation_frame="month", scope='asia',
                     title='Impact of lockdown on employment in India',
                     template='plotly_dark')

# Slow the animation down: 2000 ms per frame for the first button of the first update menu
# (presumably the play button created by plotly express — confirm).
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 2000

# Restrict the visible map to the latitude/longitude box around India and colour the ocean.
fig.update_geos(lataxis_range=[5,40], lonaxis_range=[65, 100],oceancolor="lightblue",
    showocean=True)

fig.show()