In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Location of the "Unemployment in India" CSV; edit this if the file lives elsewhere.
DATA_PATH = '/kaggle/input/unemployment-in-india/Unemployment in India.csv'

# Read the raw file into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.5,Rural
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural


In [3]:
# Inspect the raw column labels; the output shows that several carry a leading space.
df.columns

Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
       ' Estimated Employed', ' Estimated Labour Participation Rate (%)',
       'Area'],
      dtype='object')

In [4]:
# Print the dtype and non-null count of every column to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    740 non-null    object 
 1    Date                                     740 non-null    object 
 2    Frequency                                740 non-null    object 
 3    Estimated Unemployment Rate (%)          740 non-null    float64
 4    Estimated Employed                       740 non-null    float64
 5    Estimated Labour Participation Rate (%)  740 non-null    float64
 6   Area                                      740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB


In [5]:
# Number of missing values in each column.
df.isna().sum()

Region                                      28
 Date                                       28
 Frequency                                  28
 Estimated Unemployment Rate (%)            28
 Estimated Employed                         28
 Estimated Labour Participation Rate (%)    28
Area                                        28
dtype: int64

In [6]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [7]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')


In [8]:
# Several headers in the raw CSV carry a leading space (e.g. ' Date').
# Strip surrounding whitespace from every label so later cells can use the
# clean names. Reassigning (rather than inplace=True) keeps the cell safe to
# re-run and avoids listing no-op mappings such as 'Region' -> 'Region'.
df = df.rename(columns=str.strip)

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()


Unnamed: 0,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%)
count,740.0,740.0,740.0
mean,11.787946,7204460.0,42.630122
std,10.721298,8087988.0,8.111094
min,0.0,49420.0,13.33
25%,4.6575,1190404.0,38.0625
50%,8.35,4744178.0,41.16
75%,15.8875,11275490.0,45.505
max,76.74,45777510.0,72.57


# Distribution of Unemployment Rate

In [10]:
# Histogram of the unemployment rate over all cleaned rows, using 20 bins.
fig = px.histogram(
    data_frame=df,
    x='Estimated Unemployment Rate (%)',
    nbins=20,
    title='Distribution of Unemployment Rate',
    template='plotly_dark',
)
fig.show()


# Unemployment Rate by Region

In [11]:
# One box per region showing the spread of its unemployment-rate observations.
fig = px.box(
    data_frame=df,
    x='Region',
    y='Estimated Unemployment Rate (%)',
    title='Unemployment Rate by Region',
    template='plotly_dark',
)
fig.show()


# Labour Participation Rate by Region

In [12]:
# One box per region showing the spread of its labour-participation-rate observations.
fig = px.box(
    data_frame=df,
    x='Region',
    y='Estimated Labour Participation Rate (%)',
    title='Labour Participation Rate by Region',
    template='plotly_dark',
)
fig.show()


# Area-wise Unemployment Rate Comparison

In [13]:
# Mean unemployment rate for each Area value, flattened back into a two-column frame.
area_means = (
    df.groupby('Area')['Estimated Unemployment Rate (%)']
    .mean()
    .reset_index()
)

# Bar chart comparing those averages.
fig = px.bar(
    area_means,
    x='Area',
    y='Estimated Unemployment Rate (%)',
    title='Average Unemployment Rate by Area',
    template='plotly_dark',
)
fig.show()


# Top Regions with Highest Unemployment

In [14]:
rate_col = 'Estimated Unemployment Rate (%)'

# Average the rate per region, order from highest to lowest, keep the first ten.
top_regions = (
    df.groupby('Region')[rate_col]
    .mean()
    .reset_index()
    .sort_values(by=rate_col, ascending=False)
    .head(10)
)

fig = px.bar(
    top_regions,
    x='Region',
    y=rate_col,
    title='Top 10 Regions with Highest Unemployment Rate',
    template='plotly_dark',
)
fig.show()


# Employment Trend in Rural vs Urban Areas

In [15]:
# 'Date' is read from the CSV as text in DD-MM-YYYY form, so parse it to get a
# real, chronologically ordered time axis instead of a categorical one.
# The frame holds many rows (one per region) for the same Date/Area pair, so
# total them first; plotting the raw rows makes each trace zig-zag through
# every region's value at each date.
# Caveat: a month in which some regions have no row will show a lower total.
employment_trend = (
    df.assign(Date=pd.to_datetime(df['Date'].str.strip(), format='%d-%m-%Y'))
    .groupby(['Date', 'Area'], as_index=False)['Estimated Employed']
    .sum()
    .sort_values('Date')
)

fig = px.area(employment_trend, x='Date', y='Estimated Employed', color='Area',
              title='Employment Trend in Rural vs Urban Areas', template='plotly_dark')
fig.show()


# Unemployment vs Employment for Different Areas

In [16]:
# Scatter of unemployment rate against number employed, coloured by Area.
fig = px.scatter(
    data_frame=df,
    x='Estimated Unemployment Rate (%)',
    y='Estimated Employed',
    color='Area',
    title='Unemployment vs Employment for Rural and Urban Areas',
    template='plotly_dark',
)
fig.show()


# Time-based Trends for Unemployment Rate in Top Regions

In [17]:
rate_col = 'Estimated Unemployment Rate (%)'

# Rank regions by their mean rate over all rows and keep the five highest.
top_regions = df.groupby('Region')[rate_col].mean().reset_index()
top_regions = top_regions.sort_values(by=rate_col, ascending=False).head(5)
top_regions_data = df[df['Region'].isin(top_regions['Region'])]

# 'Date' is still text (DD-MM-YYYY) at this point, so parse it for a proper
# time axis. A region may have more than one row per date (e.g. one per Area),
# which makes a raw line plot double back on itself; collapse to a single
# value per Region/Date using the same unweighted mean as the ranking above,
# then sort so each line is drawn in chronological order.
top_regions_trend = (
    top_regions_data
    .assign(Date=pd.to_datetime(top_regions_data['Date'].str.strip(), format='%d-%m-%Y'))
    .groupby(['Region', 'Date'], as_index=False)[rate_col]
    .mean()
    .sort_values(['Region', 'Date'])
)

fig = px.line(top_regions_trend, x='Date', y=rate_col, color='Region',
              title='Time-based Trends for Unemployment Rate in Top Regions', template='plotly_dark')
fig.show()


# Summary of the Three Plots

1. **Time-based Trends for Unemployment Rate in Top Regions**  
   This plot shows how the unemployment rates changed over time in five regions: Bihar, Haryana, Himachal Pradesh, Jharkhand, and Tripura. The rates fluctuate a lot, with a big rise in mid-2020, especially in Tripura, which has the highest unemployment rates.

2. **Employment Trend in Rural vs Urban Areas**  
   This chart compares employment in rural and urban areas. Rural areas always have more employed people than urban areas. Both areas see seasonal increases in employment, but rural areas have a much larger number of workers.

3. **Top 10 Regions with Highest Unemployment Rate**  
   This bar chart lists the regions with the highest unemployment rates. Tripura has the highest at around 28.35%, followed by Haryana and Jharkhand. Other regions with high unemployment include Himachal Pradesh, Bihar, and Delhi, with the rates decreasing as you go down the list.
