# Unemployment in India Analysis
CodeAlpha — Data Science Internship


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's theme with the 'whitegrid' style to the plots drawn below.
sns.set(style='whitegrid')

# If using Google Colab, uncomment the next lines to upload the CSV
# from google.colab import files
# uploaded = files.upload()

# Load the raw dataset. Update the filename if yours differs; the CSV must sit
# in the working directory (or be uploaded first when running in Colab).
csv_path = 'Unemployment in India.csv'
df = pd.read_csv(csv_path)

# Quick look at the raw data before any cleaning
print('Dataset head:')
display(df.head())
print('\nData types and info:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

# Normalise column headers: trim surrounding whitespace, lowercase, and
# replace inner spaces with underscores.
def _normalise_column_name(raw_name):
    """Return *raw_name* stripped, lowercased, with spaces turned into underscores."""
    trimmed = raw_name.strip()
    return trimmed.lower().replace(' ', '_')


df.columns = list(map(_normalise_column_name, df.columns))
print('\nCleaned columns:', df.columns.tolist())

# Convert the date column(s) to datetime. The plotting cells below refer to the
# column as 'date', so when no column has exactly that name, every date-like
# column is converted and the first one is renamed to 'date'.
# NOTE(review): if the CSV stores day-first dates (e.g. DD-MM-YYYY), consider
# passing dayfirst=True or an explicit format to pd.to_datetime — confirm
# against the actual data before changing the parsing.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
else:
    date_like_cols = [c for c in df.columns if 'date' in c]
    for c in date_like_cols:
        df[c] = pd.to_datetime(df[c])
    if date_like_cols:
        # Give downstream cells the canonical 'date' column they expect.
        df = df.rename(columns={date_like_cols[0]: 'date'})

# Missing-value counts per column, followed by descriptive statistics
# covering every column (numeric and non-numeric alike).
missing_per_column = df.isnull().sum()
print('\nNull values per column:')
print(missing_per_column)

summary_table = df.describe(include='all')
print('\nSummary statistics:')
display(summary_table)

# Basic EDA plots
# Resolve the unemployment-rate column first, so that a missing column fails
# before an empty figure has been created. `ycol` is reused by later cells.
rate_column_candidates = [
    'estimated_unemployment_rate_(%)',
    'estimated_unemployment_rate',
    'unemployment_rate',
    'value',
]
ycol = next((name for name in rate_column_candidates if name in df.columns), None)
if ycol is None:
    raise ValueError('Could not find unemployment-rate column. Please check column names and update ycol variable.')

# The time-series plot needs a 'date' column; skip it with a message when the
# column is absent, mirroring how the other plotting cells guard their inputs.
if 'date' in df.columns:
    plt.figure(figsize=(14, 6))
    sns.lineplot(data=df, x='date', y=ycol)
    plt.title('Unemployment Rate Over Time in India')
    plt.xlabel('Date')
    plt.ylabel('Unemployment Rate (%)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print('Column "date" not found; skipping time-series plot')

# Distribution of the unemployment rate per region. The figure is created only
# when the 'region' column exists, so the skip path leaves no empty figure.
if 'region' in df.columns:
    plt.figure(figsize=(14, 6))
    sns.boxplot(data=df, x='region', y=ycol)
    plt.xticks(rotation=90)  # region names are long; rotate to avoid overlap
    plt.title('Unemployment Rate by Region')
    plt.ylabel('Unemployment Rate (%)')
    plt.xlabel('Region')
    plt.tight_layout()
    plt.show()
else:
    print('Column "region" not found; skipping region boxplot')

# Covid period analysis: restrict to 2020-01-01 .. 2021-12-31 (both ends
# inclusive) and plot the rate over time, split by region when available.
if 'date' in df.columns:
    in_covid_window = df['date'].between('2020-01-01', '2021-12-31')
    covid_df = df[in_covid_window]

    plt.figure(figsize=(14,6))
    # hue=None draws a single line, which matches the plot without a region split.
    region_hue = 'region' if 'region' in df.columns else None
    sns.lineplot(data=covid_df, x='date', y=ycol, hue=region_hue)
    plt.title('Covid-19 Impact on Unemployment (2020–2021)')
    plt.xlabel('Date')
    plt.ylabel('Unemployment Rate (%)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Write the cleaned DataFrame to disk (without the index) for quick preview.
cleaned_csv_name = 'cleaned_unemployment_sample.csv'
df.to_csv(cleaned_csv_name, index=False)
print('\nSaved cleaned sample: cleaned_unemployment_sample.csv')


## Notes
- Put the original CSV file `Unemployment in India.csv` in the same folder as the notebook before running locally, or upload it when running in Colab.
- The notebook auto-detects common unemployment column names; if your CSV uses a different name, update the `ycol` variable.
