# Imports

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Constants

In [3]:
PROCESSED_FILE_DIRECTORY = r"..\data\excel"

REPORT_FILE_DIRECTORY = r"..\data\report"

# Get Files

In [4]:
files = os.listdir(PROCESSED_FILE_DIRECTORY)

files = [f for f in files if os.path.isfile(PROCESSED_FILE_DIRECTORY + '/' + f)]

# Data Processing

## Sheet 0

In [5]:
def get_empty_dataframe(file):
    data_df = pd.DataFrame({'2022-23': [0],
                                '2021-22': [0],
                                '2020-21': [0],
                                '2019-20': [0],
                                '2018-19': [0],
                                '2017-18': [0],
                                'Institution': file.split('.')[0]
                                })
    return data_df

In [6]:
def process_sheet0():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "0" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:            
                #print(f"Processing the sheet 0 of the file: {file}")

                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="0")
                
                data_df = data_df.iloc[:, 1:]
                data_df.replace('-', 0, inplace=True)  
                
                data_df['2022-23'] = data_df['2022-23'].astype(np.int64)
                data_df['2021-22'] = data_df['2021-22'].astype(np.int64)
                data_df['2020-21'] = data_df['2020-21'].astype(np.int64)                                                             
                data_df['2019-20'] = data_df['2019-20'].astype(np.int64)
                data_df['2018-19'] = data_df['2018-19'].astype(np.int64)
                data_df['2017-18'] = data_df['2017-18'].astype(np.int64)

                data_df['Total'] = data_df['2022-23'] + data_df['2021-22'] + data_df['2020-21'] + data_df['2019-20'] + \
                data_df['2018-19'] + data_df['2017-18']   
            else:
                data_df = get_empty_dataframe(file)
            
            data_df.fillna(0, inplace=True)
            nirf_df = pd.concat([nirf_df, data_df])           
    except Exception as ex:
        print(f"Error while processing the sheet 0 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)
    finally:
        nirf_df = nirf_df.groupby('Institution').sum('Total')[['Total']]
        nirf_df = nirf_df.transpose()
        nirf_df.rename(index={'Total': 'Total sanctioned approved intake for all programs'}, inplace=True)    

    return nirf_df

In [7]:
process_sheet0()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Total sanctioned approved intake for all programs,7038,5118,2700,29439,3008,8646,26488,23676,19380,11548,...,2859,4121,5635,20629,4306,4604,3540,2595,5240,1273


### Sheet 1

In [8]:
def get_empty_dataframe(file):
        data_df = pd.DataFrame({'Total Students': [0], 'Institution': file.split('.')[0]})
        
        return data_df

In [9]:
def process_sheet1():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "1" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                #print(f"Processing the sheet 1 of the file: {file}")
                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="1")

                data_df = data_df[['Total Students', 'Institution']]
            else:
                data_df = get_empty_dataframe(file)
                
            data_df.fillna(0, inplace=True)        
            nirf_df = pd.concat([nirf_df, data_df])                        
    except Exception as ex:
        print(f"Error while processing the sheet 1 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:                
        nirf_df = nirf_df.groupby('Institution').sum('Total Students')
        nirf_df = nirf_df.transpose()
        nirf_df.rename(index={'Total Students': 'Total number of students enrolled in all programs'}, inplace=True)        
    
    return nirf_df

In [10]:
process_sheet1()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Total number of students enrolled in all programs,7397,5199,2245,29466,2788,8730,24376,22894,18782,10394,...,2574,3851,5678,19448,4040,4165,3537,2745,5227,1192


In [11]:
def process_sheet1_1():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "1" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                #print(f"Processing the sheet 1 of the file: {file}")
                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="1")

                #data_df = data_df.iloc[:, [7, 8, 13]]
                
                data_df['Total Students'] = data_df.iloc[:, 7] + data_df.iloc[:, 8]
            else:
                data_df = get_empty_dataframe(file)
                
            data_df.fillna(0, inplace=True)        
            nirf_df = pd.concat([nirf_df, data_df])                        
    except Exception as ex:
        print(f"Error while processing the sheet 1 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:                
        nirf_df = nirf_df.groupby('Institution').sum('Total Students')
        nirf_df = nirf_df.transpose()
        nirf_df = nirf_df.iloc[[-1], :]
        nirf_df.rename(index={'Total Students': 'Total number of economically socially challenged \
        students enrolled in all programs'}, inplace=True)        
    
    return nirf_df

In [12]:
process_sheet1_1()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
No. of students\rwho are not\rreceiving full\rtuition fee\rreimbursement,0.0,3463.0,154.0,10523.0,695.0,29.0,1478.0,10450.0,4171.0,166.0,...,120.0,880.0,1464.0,5897.0,512.0,21.0,0.0,59.0,157.0,413.0


In [13]:
def process_sheet1_2():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "1" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                #print(f"Processing the sheet 1 of the file: {file}")
                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="1")

                data_df = data_df.iloc[:, [9, 10, 11, 13]]
            else:
                data_df = get_empty_dataframe(file)
                
            data_df.fillna(0, inplace=True)        
            nirf_df = pd.concat([nirf_df, data_df])                        
    except Exception as ex:
        print(f"Error while processing the sheet 1 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:                
        nirf_df = nirf_df.groupby('Institution').sum()
        nirf_df = nirf_df.transpose()
        #nirf_df.rename(index={'Total Students': 'Total number of economically socially challenged \
        #students enrolled in all programs'}, inplace=True)        
    
    return nirf_df

In [14]:
process_sheet1_2()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from the State_x000D_and Central_x000D_Government,308.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from Institution_x000D_Funds,4066.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from the Private_x000D_Bodies,527.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom the State\rand Central\rGovernment,0.0,0.0,1773.0,7395.0,129.0,5207.0,123.0,331.0,3873.0,5436.0,...,1402.0,69.0,1756.0,580.0,457.0,323.0,2144.0,0.0,5070.0,130.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom Institution\rFunds,0.0,262.0,183.0,15.0,676.0,1901.0,1450.0,1382.0,855.0,1453.0,...,0.0,60.0,0.0,1635.0,1489.0,2355.0,16.0,289.0,0.0,0.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom the Private\rBodies,0.0,0.0,90.0,153.0,2.0,1005.0,0.0,1.0,2862.0,335.0,...,0.0,12.0,0.0,758.0,14.0,12.0,0.0,42.0,0.0,90.0


## Sheet 2

In [15]:
def get_empty_dataframe(file):
    data_df = pd.DataFrame({'Median salary': [0], 'Institution': file.split('.')[0]})
    
    return data_df

In [16]:
def process_sheet2():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            try:
                if "2" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 2 of the file: {file}")
                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="2")
                    
                    columns = data_df.columns.to_list()
                    median_index = 0
                    institute_index = 0
                    
                    for index, element in enumerate(columns):
                        if element.split()[0] == 'Median':
                            median_index = index                            
                        elif element.split()[0] == 'Institution':
                            institute_index = index

                    data_df = pd.DataFrame({
                        'Median salary': data_df.iloc[:, median_index].values,
                        'Institution': data_df.iloc[:, institute_index].values
                    })

                    data_df['Median salary'] = data_df['Median salary'].str.split(r"(").str[0].astype(np.float64)
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                data_df = get_empty_dataframe(file)
            finally:  
                data_df.fillna(0, inplace=True)
                nirf_df = pd.concat([nirf_df, data_df])           
    except Exception as ex:
        print(f"Error while processing the sheet 2 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)
    finally:            
        nirf_df = nirf_df.groupby('Institution').mean('Median salary')
        nirf_df = nirf_df.transpose()           
    
    return nirf_df

In [17]:
process_sheet2()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Median salary,1621147.0,666666.666667,246934.666667,750000.0,1908333.0,510000.0,333333.333333,500000.0,490262.0,578333.333333,...,966666.666667,249666.666667,2093333.0,398333.333333,1081667.0,953333.333333,53058.823529,870333.333333,766666.666667,820000.0


## Sheet 4

In [18]:
def get_empty_dataframe(file):
    data_df = pd.DataFrame({'Total': [0], 'Institution': file.split('.')[0]})
    
    return data_df

In [19]:
def process_sheet4():
    nirf_df = pd.DataFrame()
    
    try:    
        for file in files:
            try:
                if "7" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 4 of the file: {file}")

                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="7")

                    data_df.rename(columns={'Unnamed: 0': 'Total'}, inplace=True)
                    data_df = data_df.iloc[1:3, [1, 4]]
                    data_df['Total'] = data_df['Total'].astype(np.int64) 
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 4 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:    
                data_df.fillna(0, inplace=True)
                nirf_df = pd.concat([nirf_df, data_df])                              
    except Exception as ex:
        print(f"Error while processing the sheet 4 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)
    finally:            
        nirf_df = nirf_df.groupby('Institution').sum('Total')
        nirf_df = nirf_df.transpose()
        nirf_df.rename(index={'Total': 'Total number of PhD students enrolled'}, inplace=True)
    
    return nirf_df

In [20]:
process_sheet4()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Total number of PhD students enrolled,2841,0,0,0,0,0,0,3502,0,0,...,0,254,0,1144,0,0,0,0,0,0


In [21]:
def process_sheet4_1():
    nirf_df = pd.DataFrame()
    
    try:    
        for file in files:
            try:
                if "7" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 4 of the file: {file}")

                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="7")

                    data_df = data_df.iloc[[5], 1:]
                    data_df.fillna(0, inplace=True)
                    
                    data_df['Unnamed: 0'] = data_df['Unnamed: 0'].astype(np.float64)
                    data_df['Unnamed: 1'] = data_df['Unnamed: 1'].astype(np.float64)
                    data_df['Unnamed: 2'] = data_df['Unnamed: 2'].astype(np.float64)                    

                    data_df['Total'] = (data_df['Unnamed: 0'] + data_df['Unnamed: 1'] + data_df['Unnamed: 2'])/3              
                    data_df = data_df[['Total', 'Institution']]
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 4 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:    
                data_df.fillna(0, inplace=True)
                nirf_df = pd.concat([nirf_df, data_df])            
    except Exception as ex:
        print(f"Error while processing the sheet 4 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:            
        nirf_df = nirf_df.set_index('Institution')
        nirf_df.sort_index(inplace=True)
        #nirf_df = nirf_df.groupby('Institution').mean('Total')
        nirf_df = nirf_df.transpose()
        nirf_df.rename(index={'Total': 'Average Full Time PhD students graduated'}, inplace=True)
    
    return nirf_df

In [22]:
process_sheet4_1()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Average Full Time PhD students graduated,294.333333,0.0,0.0,0.0,0.0,0.0,0.0,792.333333,0.0,0.0,...,0.0,54.666667,0.0,71.333333,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def process_sheet4_2():
    nirf_df = pd.DataFrame()
    
    try:    
        for file in files:
            try:
                if "7" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 4 of the file: {file}")
    
                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="7")
    
                    data_df = data_df.iloc[[6], 1:]
                    
                    data_df['Unnamed: 0'] = data_df['Unnamed: 0'].astype(np.float64)
                    data_df['Unnamed: 1'] = data_df['Unnamed: 1'].astype(np.float64)
                    data_df['Unnamed: 2'] = data_df['Unnamed: 2'].astype(np.float64)
                    
                    data_df['Total'] = (data_df['Unnamed: 0'] + data_df['Unnamed: 1'] + data_df['Unnamed: 2'])/3              
                    data_df = data_df[['Total', 'Institution']]
                    data_df.fillna(0, inplace=True)
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 4 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:    
                data_df.fillna(0, inplace=True)
                nirf_df = pd.concat([nirf_df, data_df])                            
    except Exception as ex:
        print(f"Error while processing the sheet 4 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)   
    finally:            
        nirf_df = nirf_df.set_index('Institution')
        nirf_df.sort_index(inplace=True)
        nirf_df = nirf_df.transpose()
        nirf_df.rename(index={'Total': 'Average Part Time PhD students graduated'}, inplace=True)
    
    return nirf_df

In [24]:
process_sheet4_2()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Average Part Time PhD students graduated,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,135.666667,0.0,0.0,0.0,0.0,0.0,0.0


## Sheet 5

In [25]:
def get_empty_dataframe(file):
    data_df = pd.DataFrame({
                            '2022-23': [0],
                            '2021-22': [0],
                            '2020-21': [0],
                            'Institution': file.split('.')[0],
                            'Capex1': [0],
                            'Capex2': [0],
                            'Capex3': [0]
                        })
    return data_df

In [26]:
def process_sheet5():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            try:
                if "8" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 5 of the file: {file}")

                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="8")

                    data_df = data_df.iloc[2:, 1:]
                    data_df.fillna(0, inplace=True)

                    data_df['2022-23'] = data_df['2022-23'].str.split(r"(").str[0].astype(np.int64)

                    data_df['2021-22'] = data_df['2021-22'].str.split(r"(").str[0].astype(np.int64)

                    data_df['2020-21'] = data_df['2020-21'].str.split(r"(").str[0].astype(np.int64)

                    data_df['Capex1'] = round((np.sum(data_df['2022-23'])/100000),1)

                    data_df['Capex2'] = round((np.sum(data_df['2021-22'])/100000),1)

                    data_df['Capex3'] = round((np.sum(data_df['2020-21'])/100000),1)
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 5 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:
                nirf_df = pd.concat([nirf_df, data_df])       
    except Exception as ex:
        print(f"Error while processing the sheet 5 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)     
    finally:            
        nirf_df = nirf_df[['Capex1', 'Capex2', 'Capex3', 'Institution']].sort_values('Institution')
        nirf_df = nirf_df.set_index('Institution')
        nirf_df = nirf_df.groupby('Institution').head(1)
        nirf_df = nirf_df.transpose()      
    
    return nirf_df

In [27]:
process_sheet5()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Capex1,62000.9,0.0,0.0,0.0,0.0,0.0,0.0,8351.6,0.0,0.0,...,0.0,931.4,0.0,30713.2,0.0,0.0,0.0,0.0,0.0,0.0
Capex2,58448.8,0.0,0.0,0.0,0.0,0.0,0.0,3624.7,0.0,0.0,...,0.0,225.9,0.0,22172.8,0.0,0.0,0.0,0.0,0.0,0.0
Capex3,17971.5,0.0,0.0,0.0,0.0,0.0,0.0,5881.7,0.0,0.0,...,0.0,145.9,0.0,13208.8,0.0,0.0,0.0,0.0,0.0,0.0


## Sheet 6

In [28]:
def get_empty_dataframe(file):    
    data_df = pd.DataFrame({
                        '2022-23': [0],
                        '2021-22': [0],
                        '2020-21': [0],
                        'Institution': file.split('.')[0],
                        'Opex1': [0],
                        'Opex2': [0],
                        'Opex3': [0]
                    })
    return data_df

In [29]:
def process_sheet6():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            try:
                if "9" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 6 of the file: {file}")

                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="9")

                    data_df = data_df.iloc[2:, 1:]
                    data_df.fillna(0, inplace=True)
                    
                    data_df['2022-23'] = data_df['2022-23'].str.split(r"(").str[0].astype(np.int64)

                    data_df['2021-22'] = data_df['2021-22'].str.split(r"(").str[0].astype(np.int64)

                    data_df['2020-21'] = data_df['2020-21'].str.split(r"(").str[0].astype(np.int64)

                    data_df['Opex1'] = round((np.sum(data_df['2022-23'])/1000000),1)

                    data_df['Opex2'] = round((np.sum(data_df['2021-22'])/1000000),1)

                    data_df['Opex3'] = round((np.sum(data_df['2020-21'])/1000000),1)                
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 6 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:
                nirf_df = pd.concat([nirf_df, data_df])               
    except Exception as ex:
        print(f"Error while processing the sheet 6 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)
    finally:            
        nirf_df = nirf_df[['Opex1', 'Opex2', 'Opex3', 'Institution']].sort_values('Institution')
        nirf_df = nirf_df.set_index('Institution')
        nirf_df = nirf_df.groupby('Institution').head(1)
        nirf_df = nirf_df.transpose() 
    
    return nirf_df

In [30]:
process_sheet6()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Opex1,14246.1,0.0,0.0,0.0,0.0,0.0,0.0,13333.7,0.0,0.0,...,0.0,493.0,0.0,4999.1,0.0,0.0,0.0,0.0,0.0,0.0
Opex2,14231.4,0.0,0.0,0.0,0.0,0.0,0.0,10233.0,0.0,0.0,...,0.0,336.4,0.0,4925.8,0.0,0.0,0.0,0.0,0.0,0.0
Opex3,10328.5,0.0,0.0,0.0,0.0,0.0,0.0,6760.7,0.0,0.0,...,0.0,261.4,0.0,3093.8,0.0,0.0,0.0,0.0,0.0,0.0


## Sheet 7

In [31]:
def get_empty_dataframe(file):         
    data_df = pd.DataFrame({
                        'Sponsored Average': [0], 
                        'Institution': file.split('.')[0]
                    })
    return data_df

In [32]:
def process_sheet7():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            try:
                if "10" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                    #print(f"Processing the sheet 7 of the file: {file}")

                    excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                    data_df = pd.read_excel(excel_file, sheet_name="10")

                    data_df = data_df.iloc[[0], 1:]
                    data_df.fillna(0, inplace=True)
                    
                    data_df['2022-23'] = data_df['2022-23'].astype(np.float64)
                    data_df['2021-22'] = data_df['2021-22'].astype(np.float64)
                    data_df['2020-21'] = data_df['2020-21'].astype(np.float64)

                    data_df['Sponsored Average'] = (data_df['2022-23'] + data_df['2021-22'] + data_df['2020-21'])/3
                    data_df = data_df[['Sponsored Average', 'Institution']]
                else:
                    data_df = get_empty_dataframe(file)
            except Exception as ex:
                print(f"Error while processing the sheet 3 of the file: {file}")
                print(str(ex))
                data_df = get_empty_dataframe(file)
            finally:
                nirf_df = pd.concat([nirf_df, data_df])   
    except Exception as ex:
        print(f"Error while processing the sheet 7 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file)
    finally:            
        nirf_df = nirf_df.groupby('Institution').mean('Sponsored Average')
        nirf_df = nirf_df.transpose() 
    
    return nirf_df

In [33]:
process_sheet7()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Sponsored Average,686.333333,0.0,0.0,0.0,0.0,0.0,0.0,477.0,0.0,0.0,...,0.0,10.0,0.0,156.666667,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
def process_sheet7_1():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "11" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                #print(f"Processing the sheet 8 of the file: {file}")

                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="11")

                data_df = data_df.iloc[[0], 1:]
                data_df['2022-23'] = data_df['2022-23'].astype(np.float64)
                data_df['2021-22'] = data_df['2021-22'].astype(np.float64)
                data_df['2020-21'] = data_df['2020-21'].astype(np.float64)

                data_df['Consultancy Average'] = (data_df['2022-23'] + data_df['2021-22'] + data_df['2020-21'])/3
                data_df = data_df[['Consultancy Average', 'Institution']]
            else:
                data_df = get_empty_dataframe(file)
                
            data_df.fillna(0, inplace=True)        
            nirf_df = pd.concat([nirf_df, data_df])
    except Exception as ex:
        print(f"Error while processing the sheet 7 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:        
        nirf_df = nirf_df.groupby('Institution').mean('Consultancy Average')
        nirf_df = nirf_df.transpose()
    
    return nirf_df

In [35]:
process_sheet7_1()

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Consultancy Average,1034.666667,,,,,,,6.666667,,,...,,7.666667,,320.333333,,,,,,
Sponsored Average,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
def process_sheet7_2():
    nirf_df = pd.DataFrame()
    
    try:
        for file in files:
            if "12" in pd.ExcelFile(PROCESSED_FILE_DIRECTORY + '\\' + file).sheet_names:
                #print(f"Processing the sheet 8 of the file: {file}")

                excel_file = PROCESSED_FILE_DIRECTORY + '\\' + file
                data_df = pd.read_excel(excel_file, sheet_name="12")

                data_df = data_df.iloc[[0], 1:]
                data_df['2022-23'] = data_df['2022-23'].astype(np.float64)
                data_df['2021-22'] = data_df['2021-22'].astype(np.float64)
                data_df['2020-21'] = data_df['2020-21'].astype(np.float64)

                data_df['Executive Average'] = (data_df['2022-23'] + data_df['2021-22'] + data_df['2020-21'])/3
                data_df = data_df[['Executive Average', 'Institution']]
            else:
                data_df = get_empty_dataframe(file)
                
            data_df.fillna(0, inplace=True)        
            nirf_df = pd.concat([nirf_df, data_df])
    except Exception as ex:
        print(f"Error while processing the sheet 7 of the file: {file}")
        print(str(ex))
        nirf_df = get_empty_dataframe(file) 
    finally:        
        nirf_df = nirf_df.groupby('Institution').mean('Executive Average')
        nirf_df = nirf_df.transpose()
    
    return nirf_df

In [38]:
nirf_df = pd.DataFrame()

data = process_sheet0()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet1()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet1_1()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet1_2()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet2()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet4()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet4_1()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet4_2()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet5()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet6()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet7()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet7_1()
nirf_df = pd.concat([nirf_df, data])

data = process_sheet7_2()
nirf_df = pd.concat([nirf_df, data])

nirf_df = nirf_df.transpose()

nirf_df['capexperstudavg'] = (nirf_df['Capex1'] + \
                              nirf_df['Capex2'] + nirf_df['Capex3'])/ \
(3 * (nirf_df['Total sanctioned approved intake for all programs'] + nirf_df['Total number of PhD students enrolled']))

nirf_df['opexperstudavg'] = (nirf_df['Opex1'] + \
                              nirf_df['Opex2'] + nirf_df['Opex3'])/ \
(3 * (nirf_df['Total sanctioned approved intake for all programs'] + nirf_df['Total number of PhD students enrolled']))

nirf_df = nirf_df.transpose()

nirf_df.fillna(0, inplace=True)

nirf_df

Institution,1 IR-O-U-0456,10 IR-O-U-0109,100 IR-O-U-0470,11 IR-O-U-0500,12 IR-O-U-0013,13 IR-O-U-0108,14 IR-O-U-0234,15 IR-O-U-0120,16 IR-O-U-0496,17 IR-O-U-0575,...,88 IR-O-U-0523,89 IR-O-U-0190,9 IR-O-U-0053,91 IR-O-U-0446,92 IR-O-U-0055,93 IR-O-U-0577,94 IR-O-U-0136,95 IR-O-U-0642,97 IR-O-U-0003,99 IR-O-U-0686
Total sanctioned approved intake for all programs,7038.0,5118.0,2700.0,29439.0,3008.0,8646.0,26488.0,23676.0,19380.0,11548.0,...,2859.0,4121.0,5635.0,20629.0,4306.0,4604.0,3540.0,2595.0,5240.0,1273.0
Total number of students enrolled in all programs,7397.0,5199.0,2245.0,29466.0,2788.0,8730.0,24376.0,22894.0,18782.0,10394.0,...,2574.0,3851.0,5678.0,19448.0,4040.0,4165.0,3537.0,2745.0,5227.0,1192.0
No. of students\rwho are not\rreceiving full\rtuition fee\rreimbursement,0.0,3463.0,154.0,10523.0,695.0,29.0,1478.0,10450.0,4171.0,166.0,...,120.0,880.0,1464.0,5897.0,512.0,21.0,0.0,59.0,157.0,413.0
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from the State_x000D_and Central_x000D_Government,308.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from Institution_x000D_Funds,4066.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students_x000D_receiving full_x000D_tuition fee_x000D_reimbursement_x000D_from the Private_x000D_Bodies,527.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom the State\rand Central\rGovernment,0.0,0.0,1773.0,7395.0,129.0,5207.0,123.0,331.0,3873.0,5436.0,...,1402.0,69.0,1756.0,580.0,457.0,323.0,2144.0,0.0,5070.0,130.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom Institution\rFunds,0.0,262.0,183.0,15.0,676.0,1901.0,1450.0,1382.0,855.0,1453.0,...,0.0,60.0,0.0,1635.0,1489.0,2355.0,16.0,289.0,0.0,0.0
No. of students\rreceiving full\rtuition fee\rreimbursement\rfrom the Private\rBodies,0.0,0.0,90.0,153.0,2.0,1005.0,0.0,1.0,2862.0,335.0,...,0.0,12.0,0.0,758.0,14.0,12.0,0.0,42.0,0.0,90.0
Median salary,1621147.0,666666.666667,246934.666667,750000.0,1908333.0,510000.0,333333.333333,500000.0,490262.0,578333.333333,...,966666.666667,249666.666667,2093333.0,398333.333333,1081667.0,953333.333333,53058.823529,870333.333333,766666.666667,820000.0


In [39]:
nirf_df.to_excel(REPORT_FILE_DIRECTORY + "/PredictionReport.xlsx", freeze_panes=(1, 1),)