<a href="https://colab.research.google.com/github/kabeerbora/1980s_delin/blob/main/Panel_79_89_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import secrets
import string
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV path used for loading the data lives under this mount.
drive.mount('/content/drive')

## 1. Data Loading and Cleaning

### Sampling and Data Collection

| **Scheme Code**                 | **Description**                                                                                                                              | **Standard Category** | **Reference** |
|--------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|------------------------|---------------|
| **Complete Enumeration**        | Units selected for a complete count, typically large or significant units.                                                                   | Census                 | [ASI 1990-91 Metadata](https://microdata.gov.in/NADA/index.php/catalog/41/study-description) |
| **100 or more workers**         | Units employing 100 or more workers, often included in the census sector.                                                                   | Census                 | [ASI 1990-91 Metadata](https://microdata.gov.in/NADA/index.php/catalog/41/study-description) |
| **Sample I**                    | A subset of units selected for sampling, possibly based on specific criteria.                                                                | Sample                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **Sample II**                   | Another subset of units selected for sampling, possibly based on different criteria than Sample I.                                           | Sample                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **B & C 100 or more workers**   | Units in categories B & C with 100 or more workers.                                                                                          | Census                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **B & C–CE**                    | B & C category units under Complete Enumeration.                                                                                             | Census                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **B & C Sample I**              | B & C category units under Sample I.                                                                                                         | Sample                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **B & C Sample II**             | B & C category units under Sample II.                                                                                                        | Sample                 | [ASI 1987-88 Documentation](https://microdata.gov.in/nada43/index.php/catalog/38/download/383) |
| **Electricity**                 | Units primarily engaged in electricity generation or distribution.                                                                           | Census                 | [ASI 1990-91 Metadata](https://microdata.gov.in/NADA/index.php/catalog/41/study-description) |
| **NR**                          | Not Reported or Not Recorded.                                                                                                                | Exclude                | [ASI 1990-91 Metadata](https://microdata.gov.in/NADA/index.php/catalog/41/study-description) |

In [None]:
## Load the raw firm-level file from the mounted Google Drive.
data_file_path = r"/content/drive/MyDrive/Projects/Annual Survey of India/Data/Copy of 1976_1988_allfirms.csv"

df = pd.read_csv(data_file_path)

metric_columns = ["capital_open", "capital_closing", "work_cap_open", "work_cap_close", "outstanding_open", "outstanding_close", "semi_open", "semi_close"]

## 1. Cleaning and generating state code
## Rewrite the two alternative union-territory spellings to a single spelling
## each, so that each territory receives exactly one numeric code below.
df['state_code'] = df['state_code'].replace(['Daman and Diu'], 'DAMAN  &  DIU')
df['state_code'] = df['state_code'].replace(['Dadra & Nagar Haveli'], 'DADRA  AND  NAGAR  HAVELI')

## Numeric state codes follow the order of first appearance in the file.
unique_state_codes = df['state_code'].unique()
state_mapping = {code: i for i, code in enumerate(unique_state_codes)}
df['State'] = df['state_code'].map(state_mapping)

## 2. Filter census data
## A concrete list (not a one-shot `map` iterator) so the codes can be
## inspected or reused after the filter has run.
census_scheme_codes = [
    code.lower().strip()
    for code in (
        'Census', 'Complete Enumeration', '100 or more workers',
        'B & C 100 or more workers', 'B & C–CE', 'Electricity',
    )
]
df = df[df['scheme_code'].str.lower().str.strip().isin(census_scheme_codes)]

## 3. Remove years
years_to_remove = ['1976_1977', '1977_1978','1989_1990', '1990_1991']
df = df[~df['year'].isin(years_to_remove)]

## 4. Normalise 'ownership_code' labels and map them to numeric codes
## Trim, collapse repeated whitespace and lower-case so spelling variants of
## the same label hit the same mapping key.
df['ownership_code'] = (
    df['ownership_code']
    .astype(str)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

ownership_code_mapping = {
    'wholly private enterprise': 1,
    'wholly private ownership': 1,
    'wholly central government': 2,
    'wholly state and/or local government': 3,
    'central government and state and/or local government jointly': 4,
    'central government and state and/or local government joint': 4,
    'joint sector public': 5,
    'joint sector private': 6,
    'invalid': 7,
    '7.0': 7
}

df['ownership_code_unique'] = df['ownership_code'].map(ownership_code_mapping)

## Report labels the mapping does not cover (they become NaN above).
unmatched = df[df['ownership_code_unique'].isna()]['ownership_code'].unique()
print("Unmatched ownership_code values:\n", unmatched)

## 5. Generate Identifier
## Format: <first 4 chars of nic_code>_<ownership code>_<state code>.
## The ownership code is rendered as an integer string ('1', never '1.0') so
## the identifier format does not depend on whether any label was unmatched;
## unmatched labels are rendered as 'nan'.
ownership_part = (
    df['ownership_code_unique']
    .map(lambda code: 'nan' if pd.isna(code) else str(int(code)))
    .astype(str)
)
df['identifier'] = df['nic_code'].astype(str).str[:4] + '_' + \
                   ownership_part + '_' + \
                   df['State'].astype(str)

## Keep only rows reporting capital at the start or the end of the year.
## Missing values count as zero here because step 6 turns NaN into 0 anyway;
## comparing NaN directly (`NaN != 0` is True) would let such rows through.
has_capital = (df['capital_open'].fillna(0) != 0) | (df['capital_closing'].fillna(0) != 0)
df = df.loc[has_capital].copy()

## 6. Convert all numerical columns to integer type
## NaN becomes 0 and fractional values are truncated by the integer cast.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(value=0).astype(int)

print(df.shape)
## ROW_ID is the 0-based position of each surviving row; later steps use it
## to refer back to individual rows.
df = df.reset_index(drop=True)
df = df.reset_index(names=['ROW_ID'])

df.head(5)

## 2. Creating Panel

Desired Output


| firm_Id  | period     | capital         | work           | outstanding      | semi           | year_initial | identifier | source_ROW_IDs       |
|----------|------------|------------------|----------------|------------------|----------------|---------------|------------|-----------------------|
| ZHY8H7FJ | 1979-1983  | 100,200,300,400  | 250,650,100,150| 500,600,700,800  | 0,1,0,1        | 1957          | 2000_1_3   | 2004,2005,2006,2007   |
| AQW9T2KL | 1980-1984  | 150,250,350,450  | 200,300,400,500| 550,650,750,850  | 1,0,1,0        | 1957          | 2000_1_4   | 2010,2011,2012,2013   |
| MNB3X9CY | 1981-1985  | 120,220,320,420  | 260,360,460,560| 520,620,720,820  | 0,0,1,1        | 1957          | 2000_1_5   | 2020,2021,2022,2023   |
| TYU7P6RE | 1982-1986  | 130,230,330,430  | 270,370,470,570| 530,630,730,830  | 1,1,0,0        | 1957          | 2000_1_6   | 2030,2031,2032,2033   |


### 2.1 Extract all the years

In [None]:
# Distinct survey-year labels present in the cleaned data, in ascending order.
distinct_year_labels = df['year'].unique()
years = sorted(distinct_year_labels)
print(years)

### 2.2 Helper Functions

In [None]:
def _select_year_balances(df, year, balance_columns, discarded_columns):
    """Return one year's rows with one set of balance columns under common names.

    Args:
        df: frame holding a ``year`` column plus opening and closing balances.
        year: value of ``year`` to select.
        balance_columns: mapping from the balance columns to keep to their
            common names (``capital``, ``work_cap``, ``outstanding``, ``semi``).
        discarded_columns: balance columns to drop before renaming.

    Returns:
        A new frame for ``year`` with duplicate matching keys removed
        (the first occurrence is kept).
    """
    dedupe_keys = ["year_initial", "capital", "work_cap", "outstanding", "semi"]
    # Rows that differ only in `identifier` are distinct candidates when the
    # frames are later merged on `identifier`, so they must not be collapsed.
    if "identifier" in df.columns:
        dedupe_keys.append("identifier")

    return (
        df[df["year"] == year]
        .drop(columns=discarded_columns)
        .rename(columns=balance_columns)
        .drop_duplicates(subset=dedupe_keys)
    )


def standardise_dataframes(df, py_year, cy_year):
    """Prepare previous-year and current-year frames that share column names.

    The current-year frame keeps the *opening* balances and the previous-year
    frame keeps the *closing* balances; both are renamed to ``capital``,
    ``work_cap``, ``outstanding`` and ``semi`` so the two frames can be
    joined on equal values.

    Args:
        df: frame with a ``year`` column, ``year_initial`` and the eight
            ``*_open`` / ``*_close`` / ``*_closing`` balance columns.
        py_year: ``year`` value of the previous year.
        cy_year: ``year`` value of the current year.

    Returns:
        Tuple ``(CY_df, PY_df)`` - note the current-year frame comes first.
    """
    opening_to_common = {
        'capital_open': 'capital',
        'work_cap_open': 'work_cap',
        'outstanding_open': 'outstanding',
        'semi_open': 'semi'
    }
    closing_to_common = {
        'capital_closing': 'capital',
        'work_cap_close': 'work_cap',
        'outstanding_close': 'outstanding',
        'semi_close': 'semi'
    }

    ## Current year: keep opening balances, discard closing ones.
    CY_df = _select_year_balances(df, cy_year, opening_to_common, list(closing_to_common))

    ## Previous year: keep closing balances, discard opening ones.
    PY_df = _select_year_balances(df, py_year, closing_to_common, list(opening_to_common))

    return CY_df, PY_df

### 2.3 Match Row IDs

Match row IDs across consecutive years: a row's `closing` values in one year must equal a row's `opening` values in the following year, and `identifier` and `year_initial` must agree as well.

In [None]:
## Columns that play no part in matching rows across years.
columns_to_drop = ['state_code','rsl', 'nic_code', 'ownership_code', 'organization_code', 'scheme_code', 'district_code', 'gross_sales', 'State','ownership_code_unique', 'tot_emoluments', 'bonus_workers', 'wages', 'tot_output', 'value_added', "persons_engaged"]
df_clean = df.drop(columns=columns_to_drop)

## A previous-year row and a current-year row match when all of these agree.
merge_keys = ['year_initial', 'capital', 'work_cap', 'outstanding', 'semi', 'identifier']

## One entry per pair of adjacent years: a list of (ROW_ID_PY, ROW_ID_CY) tuples.
matched_row_ids = []

for py_year, cy_year in zip(years, years[1:]):

    ## Bring both years onto common column names.
    CY_df, PY_df = standardise_dataframes(df_clean, py_year, cy_year)

    ## Inner join keeps only rows present on both sides.
    df_out = CY_df.merge(
        PY_df,
        on=merge_keys,
        suffixes=('_CY', '_PY'),
        how='inner',
    )

    matched_row_ids.append(list(zip(df_out["ROW_ID_PY"], df_out["ROW_ID_CY"])))


### 2.4 Merge Common Row IDS

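- Once the matched row-ID pairs have been found for every pair of consecutive years, chain them together so that all row IDs belonging to the same firm end up in a single tuple.
- Each resulting tuple then corresponds to one unique firm.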

In [None]:
def merge_row_ids(ROW_IDS_PY, ROW_IDS_CY):
    """Extend chains of row IDs with the matches of the next pair of years.

    Args:
        ROW_IDS_PY: list of tuples; each tuple is a chain of row IDs built so
            far, whose last element is the row to be continued.
        ROW_IDS_CY: list of tuples; ``b[:-1]`` is the lookup key and ``b[-1]``
            the row ID appended to a chain ending in that key. For pairs
            ``(x, y)`` this appends ``y`` to the chain ending in ``x``.

    Returns:
        Tuple ``(chains, merge_counts)``: the chains of ``ROW_IDS_PY`` in
        their original order (extended where a continuation exists), followed
        by the entries of ``ROW_IDS_CY`` whose first element does not end any
        chain; and the number of chains that were extended.
    """
    continuation_by_key = {b[:-1]: b[-1] for b in ROW_IDS_CY}
    # A set of every chain's last element: O(1) membership tests, and no tail
    # is lost when two chains happen to share the same prefix.
    chain_tails = {a[-1] for a in ROW_IDS_PY}

    merged = []
    merge_counts = 0
    for chain in ROW_IDS_PY:
        key = (chain[-1],)
        if key in continuation_by_key:
            merged.append(chain + (continuation_by_key[key],))
            merge_counts += 1
        else:
            merged.append(chain)

    # Entries that continue no existing chain start new chains of their own.
    merged.extend(b for b in ROW_IDS_CY if b[0] not in chain_tails)

    return merged, merge_counts

In [None]:
%%time

## Fold the per-year-pair matches, in list order, into one chain of row IDs
## per firm. The first list seeds the chains; every later list extends them.
## An empty or single-entry `matched_row_ids` is handled without raising.
ROW_IDS_PY = list(matched_row_ids[0]) if matched_row_ids else []

for i, ROW_IDS_CY in enumerate(tqdm(matched_row_ids[1:]), start=1):
    print(f"Iteration: {i}")
    ROW_IDS_PY, merge_counts = merge_row_ids(ROW_IDS_PY, ROW_IDS_CY)
    print(f"Total merge counts: {merge_counts}")

print("-"*100)
print(f"Total unique firms found: {len(ROW_IDS_PY)}")

### 2.5 Concat the results

In [None]:
%%time
## Index the rows by ROW_ID once, so each chain is fetched by label lookup
## instead of scanning the whole frame for every firm.
rows_by_id = df_clean.set_index("ROW_ID", drop=False)

## Output column -> (opening column, closing column).
balance_columns = {
    "capital": ("capital_open", "capital_closing"),
    "work_cap": ("work_cap_open", "work_cap_close"),
    "outstanding": ("outstanding_open", "outstanding_close"),
    "semi": ("semi_open", "semi_close"),
}

rows_data = []
for row_ids in tqdm(ROW_IDS_PY, unit="rows", total=len(ROW_IDS_PY)):

    ## Rows are taken in chain order (earliest matched year first), so the
    ## first/last row below do not depend on the row order of the source file.
    x_row_df = rows_by_id.loc[list(row_ids)]
    first_row = x_row_df.iloc[0]
    last_row = x_row_df.iloc[-1]

    data = {}

    ## 1. Extract the metadata values
    data['year_initial'] = int(first_row['year_initial'])
    data['identifier'] = first_row['identifier']
    data['row_ids'] = row_ids

    ## 2. Extract year range: start of the first year label, end of the last
    data['year_from'] = first_row['year'].split("_")[0]
    data['year_to'] = last_row['year'].split("_")[1]

    ## 3. Extract all the macro values: every opening balance along the
    ##    chain, followed by the closing balance of the chain's last row
    for output_name, (open_col, close_col) in balance_columns.items():
        data[output_name] = x_row_df[open_col].tolist() + [x_row_df[close_col].tolist()[-1]]

    rows_data.append(data)

df_final = pd.DataFrame(rows_data)

In [None]:
# Preview the first rows of the assembled panel.
df_final.head()