In [48]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

# Define the folder containing the Excel files
input_folder_name = "./NEET_2024_Excel/Individual_Excel/"
output_folder_name = "Merged_Excel"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder_name):
    os.makedirs(output_folder_name)

# List all Excel files in the folder
excel_files = [f for f in os.listdir(input_folder_name) if f.endswith('.xlsx')]

# Initialize an empty list to store DataFrames
data_frames = []


# Read each Excel file and append to the list
for file in tqdm(excel_files):
    file_path = os.path.join(input_folder_name, file)
    df = pd.read_excel(file_path)
    data_frames.append(df)

# Concatenate all DataFrames into a single DataFrame
merged_df = pd.concat(data_frames, ignore_index=True)

# Save the merged DataFrame to a new Excel file
merged_file_path = os.path.join(output_folder_name, "Merged_NEET_2024.xlsx")

print(f"Merged file saved to: {merged_file_path}")


100%|██████████| 4700/4700 [01:39<00:00, 47.11it/s]


Merged file saved to: Merged_Excel/Merged_NEET_2024.xlsx


In [49]:
!pip3 install streamlit plotly pandas


[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/GHCNpy-1.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/netCDF4-1.6.3-py3.11-macosx-13-arm64.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/cftime-1.6.2-py3.11-macosx-13-arm64.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m[33mDEPRECATION: Loading egg at /opt/homebrew/lib/python3.11/site-packages/geographiclib-2.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m[33mDEPRECATION

In [50]:
merged_df 

Unnamed: 0,Srlno.,Marks,Center No,Center Name,City Code
0,1,25,311256,311256 Page No. 1 PINNACLE INTERNATIONAL SCH...,3112
1,2,48,311256,311256 Page No. 1 PINNACLE INTERNATIONAL SCH...,3112
2,3,206,311256,311256 Page No. 1 PINNACLE INTERNATIONAL SCH...,3112
3,4,-7,311256,311256 Page No. 1 PINNACLE INTERNATIONAL SCH...,3112
4,5,134,311256,311256 Page No. 1 PINNACLE INTERNATIONAL SCH...,3112
...,...,...,...,...,...
2300204,259,274,301804,301804 Page No. 1 ST. GEORGE ENGLISH MEDIUM ...,3018
2300205,260,221,301804,301804 Page No. 1 ST. GEORGE ENGLISH MEDIUM ...,3018
2300206,261,45,301804,301804 Page No. 1 ST. GEORGE ENGLISH MEDIUM ...,3018
2300207,262,70,301804,301804 Page No. 1 ST. GEORGE ENGLISH MEDIUM ...,3018


In [51]:
import re

df = merged_df

# Function to clean the center name
def clean_center_name(center_name):
    # Remove the initial center code and page number
    cleaned_name = re.sub(r'^\d+\s+Page\s+No\.\s+\d+\s+', '', center_name)
    # Remove trailing "NEET (UG) 2024"
    cleaned_name = re.sub(r'\s+NEET\s+\(UG\)\s+2024$', '', cleaned_name)
    return cleaned_name

# Apply the cleaning function to the 'Center Name' column
df['Cleaned Center Name'] = df['Center Name'].apply(clean_center_name)



In [52]:
df.head()['Cleaned Center Name'].to_list()

['PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI TARFE PASADGAON NEAR SAI LAWNS MALEGAON ROAD NANDED, NANDED, MAHARASHTRA',
 'PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI TARFE PASADGAON NEAR SAI LAWNS MALEGAON ROAD NANDED, NANDED, MAHARASHTRA',
 'PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI TARFE PASADGAON NEAR SAI LAWNS MALEGAON ROAD NANDED, NANDED, MAHARASHTRA',
 'PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI TARFE PASADGAON NEAR SAI LAWNS MALEGAON ROAD NANDED, NANDED, MAHARASHTRA',
 'PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI TARFE PASADGAON NEAR SAI LAWNS MALEGAON ROAD NANDED, NANDED, MAHARASHTRA']

In [53]:
# read centers df.to_csv('neet_exam_centres.csv', index=False)

neet_exam_centres = pd.read_csv('neet_exam_centres.csv')

neet_exam_centres.columns

Index(['S. No.', 'City Code', 'State', 'District', 'City'], dtype='object')

In [54]:
#merge the df with neet exam centers on City Code

final_df = pd.merge(df, neet_exam_centres, on='City Code', how='left')
# cleaned center name is the center name from the neet exam centers
final_df = final_df.drop(columns = ['Center Name'])
final_df = final_df.rename(columns={'Cleaned Center Name': 'Center Name'})
# drop S. No 
final_df = final_df.drop(columns = ['S. No.'])

final_df 

Unnamed: 0,Srlno.,Marks,Center No,City Code,Center Name,State,District,City
0,1,25,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
1,2,48,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
2,3,206,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
3,4,-7,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
4,5,134,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
...,...,...,...,...,...,...,...,...
2300204,259,274,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
2300205,260,221,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
2300206,261,45,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
2300207,262,70,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar


In [55]:
# save the final df to a csv file
final_df.to_csv('all_data_neet_2024.csv', index=False)

In [35]:
final_df.columns

Index(['Srlno.', 'Marks', 'Center No', 'City Code', 'Center Name', 'State',
       'District', 'City'],
      dtype='object')

In [42]:
final_df

Unnamed: 0,Srlno.,Marks,Center No,City Code,Center Name,State,District,City
0,1,25,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
1,2,48,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
2,3,206,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
3,4,-7,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
4,5,134,311256,3112,"PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...",Maharashtra,Nanded,Nanded
...,...,...,...,...,...,...,...,...
1485980,259,274,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
1485981,260,221,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
1485982,261,45,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar
1485983,262,70,301804,3018,"ST. GEORGE ENGLISH MEDIUM SCHOOL, ST. GEORGE E...",Madhya Pradesh,Dhar,Dhar


In [43]:
import pandas as pd
import plotly.express as px

# Define the cutoff marks
cutoff_marks = 653

# Filter the DataFrame to get entries with marks above the cutoff
top_centers_df = final_df[final_df['Marks'] > cutoff_marks]

# Check the structure of top_centers_df
print(top_centers_df.head())

# Create visualizations
# Set the plotly template
template = 'plotly_dark'

# Plot the number of entries per state for top centers
state_counts = top_centers_df['State'].value_counts().reset_index()
state_counts.columns = ['State', 'Number of Entries']

fig_state = px.bar(state_counts, x='State', y='Number of Entries', title='Top Centers per State', template=template)
fig_state.show()

# Plot the number of entries per district for top centers
district_counts = top_centers_df['District'].value_counts().reset_index()
district_counts.columns = ['District', 'Number of Entries']

fig_district = px.bar(district_counts, x='District', y='Number of Entries', title='Top Centers per District', template=template)
fig_district.show()

# Plot the number of entries per city for top centers
city_counts = top_centers_df['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Number of Entries']

fig_city = px.bar(city_counts, x='City', y='Number of Entries', title='Top Centers per City', template=template)
fig_city.show()

# Plot the distribution of marks for top centers
fig_marks = px.histogram(top_centers_df, x='Marks', nbins=10, title='Distribution of Marks for Top Centers', template=template)
fig_marks.show()

# Save the interactive plots as HTML files
fig_state.write_html("top_centers_per_state.html")
fig_district.write_html("top_centers_per_district.html")
fig_city.write_html("top_centers_per_city.html")
fig_marks.write_html("distribution_of_marks_top_centers.html")

print("Interactive plots saved successfully.")


     Srlno.  Marks  Center No  City Code  \
72       73    661     311256       3112   
196      27    685     390511       3905   
228      59    710     390511       3905   
236      67    654     390511       3905   
251      82    682     390511       3905   

                                           Center Name        State District  \
72   PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...  Maharashtra   Nanded   
196  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
228  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
236  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
251  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   

       City  
72   Nanded  
196    Kota  
228    Kota  
236    Kota  
251    Kota  


Interactive plots saved successfully.


In [45]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Define the cutoff marks
cutoff_marks = 653

# Filter the DataFrame to get entries with marks above the cutoff
top_centers_df = final_df[final_df['Marks'] > cutoff_marks]

# Create visualizations
template = 'plotly_dark'

# Plot the number of entries per state for top centers
state_counts = top_centers_df['State'].value_counts().reset_index()
state_counts.columns = ['State', 'Number of Entries']

fig_state = px.bar(state_counts, x='State', y='Number of Entries', 
                   title='Top Centers per State', template=template,
                   labels={'State': 'State', 'Number of Entries': 'Number of Entries'},
                   text='Number of Entries')
fig_state.update_traces(texttemplate='%{text}', textposition='outside')
fig_state.update_layout(showlegend=False, margin=dict(t=40, b=40, l=40, r=40))
fig_state.show()

# Plot the number of entries per district for top centers
district_counts = top_centers_df['District'].value_counts().reset_index()
district_counts.columns = ['District', 'Number of Entries']

fig_district = px.bar(district_counts, x='District', y='Number of Entries', 
                      title='Top Centers per District', template=template,
                      labels={'District': 'District', 'Number of Entries': 'Number of Entries'},
                      text='Number of Entries')
fig_district.update_traces(texttemplate='%{text}', textposition='outside')
fig_district.update_layout(showlegend=False, margin=dict(t=40, b=40, l=40, r=40))
fig_district.show()

# Plot the number of entries per city for top centers
city_counts = top_centers_df['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Number of Entries']

fig_city = px.bar(city_counts, x='City', y='Number of Entries', 
                  title='Top Centers per City', template=template,
                  labels={'City': 'City', 'Number of Entries': 'Number of Entries'},
                  text='Number of Entries')
fig_city.update_traces(texttemplate='%{text}', textposition='outside')
fig_city.update_layout(showlegend=False, margin=dict(t=40, b=40, l=40, r=40))
fig_city.show()

# Plot the distribution of marks for top centers with appropriate bins
fig_marks = px.histogram(top_centers_df, x='Marks', nbins=20, 
                         title='Distribution of Marks for Top Centers', template=template,
                         labels={'Marks': 'Marks', 'count': 'Frequency'})
fig_marks.update_layout(showlegend=False, margin=dict(t=40, b=40, l=40, r=40))
fig_marks.show()

# Plot the number of entries per center for top centers
center_counts = top_centers_df['Center No'].value_counts().reset_index()
center_counts.columns = ['Center No', 'Number of Entries']

fig_center = px.bar(center_counts, x='Center No', y='Number of Entries', 
                    title='Top Centers per Center', template=template,
                    labels={'Center No': 'Center No', 'Number of Entries': 'Number of Entries'},
                    text='Number of Entries')
fig_center.update_traces(texttemplate='%{text}', textposition='outside')
fig_center.update_layout(showlegend=False, margin=dict(t=40, b=40, l=40, r=40))
fig_center.show()

# Save the interactive plots as HTML files
fig_state.write_html("top_centers_per_state.html")
fig_district.write_html("top_centers_per_district.html")
fig_city.write_html("top_centers_per_city.html")
fig_marks.write_html("distribution_of_marks_top_centers.html")
fig_center.write_html("top_centers_per_center.html")

# Add note and disclaimer
note = "Data scraped by @kushalkmrd."
disclaimer = "Disclaimer: The data presented here is for informational purposes only and is based on scraped data from public sources. Accuracy of the data is not guaranteed."

print("Interactive plots saved successfully.")

# Creating filterable tables using Plotly's dash_table
import dash
import dash_table
from dash import dcc, html

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1('Top Centers Data Analysis', style={'textAlign': 'center'}),
    html.P(note, style={'textAlign': 'center', 'fontSize': 12}),
    html.P(disclaimer, style={'textAlign': 'center', 'fontSize': 12, 'color': 'red'}),
    
    dash_table.DataTable(
        id='table',
        columns=[{"name": i, "id": i} for i in top_centers_df.columns],
        data=top_centers_df.to_dict('records'),
        filter_action='native',
        sort_action='native',
        page_size=20
    ),
    
    dcc.Graph(figure=fig_state),
    dcc.Graph(figure=fig_district),
    dcc.Graph(figure=fig_city),
    dcc.Graph(figure=fig_marks),
    dcc.Graph(figure=fig_center),
])

if __name__ == '__main__':
    app.run_server(debug=True)


Interactive plots saved successfully.


In [None]:
import pandas as pd
import plotly.express as px

# Define the cutoff marks
cutoff_marks = 653

# Filter the DataFrame to get entries with marks above the cutoff
top_centers_df = final_df[final_df['Marks'] > cutoff_marks]

# Check the structure of top_centers_df
print(top_centers_df.head())

# Create visualizations
# Set the plotly template
template = 'plotly_dark'

# Plot the number of entries per state for top centers
state_counts = top_centers_df['State'].value_counts().reset_index()
state_counts.columns = ['State', 'Number of Entries']

fig_state = px.bar(state_counts, x='State', y='Number of Entries', title='Top Centers per State', template=template)
fig_state.show()

# Plot the number of entries per district for top centers
district_counts = top_centers_df['District'].value_counts().reset_index()
district_counts.columns = ['District', 'Number of Entries']

fig_district = px.bar(district_counts, x='District', y='Number of Entries', title='Top Centers per District', template=template)
fig_district.show()

# Plot the number of entries per city for top centers
city_counts = top_centers_df['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Number of Entries']

fig_city = px.bar(city_counts, x='City', y='Number of Entries', title='Top Centers per City', template=template)
fig_city.show()

# Plot the distribution of marks for top centers
fig_marks = px.histogram(top_centers_df, x='Marks', nbins=10, title='Distribution of Marks for Top Centers', template=template)
fig_marks.show()

# Save the interactive plots as HTML files
fig_state.write_html("top_centers_per_state.html")
fig_district.write_html("top_centers_per_district.html")
fig_city.write_html("top_centers_per_city.html")
fig_marks.write_html("distribution_of_marks_top_centers.html")

print("Interactive plots saved successfully.")


     Srlno.  Marks  Center No  City Code  \
72       73    661     311256       3112   
196      27    685     390511       3905   
228      59    710     390511       3905   
236      67    654     390511       3905   
251      82    682     390511       3905   

                                           Center Name        State District  \
72   PINNACLE INTERNATIONAL SCHOOL WING B, KAKANDI ...  Maharashtra   Nanded   
196  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
228  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
236  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   
251  BAKHSHIS SPRINGDALES SR. SEC. SCHOOL, DEOLI AR...    Rajasthan     Kota   

       City  
72   Nanded  
196    Kota  
228    Kota  
236    Kota  
251    Kota  


Interactive plots saved successfully.


In [56]:
final_df.columns

Index(['Srlno.', 'Marks', 'Center No', 'City Code', 'Center Name', 'State',
       'District', 'City'],
      dtype='object')

In [59]:
# filter marks more than 720 
# if marks more than 720, then get the MArks last three digits and replace it as marks 
# copy the final df to a new df

new_df = final_df.copy()

# if marks more than 720, then get the MArks last three digits and replace it as marks note it is int 

new_df['Marks'] = new_df['Marks'].apply(lambda x: int(str(x)[-3:]) if x > 720 else x)



Unnamed: 0,Srlno.,Marks,Center No,City Code,Center Name,State,District,City


In [61]:
new_df['Marks'].describe()

count    2.300209e+06
mean     2.169338e+02
std      1.662095e+02
min     -1.800000e+02
25%      8.700000e+01
50%      1.630000e+02
75%      3.130000e+02
max      7.200000e+02
Name: Marks, dtype: float64

In [62]:
# get how many got 720 
new_df[new_df['Marks'] == 720].shape[0]

61

In [67]:
#save newdf
new_df.to_csv('all_data_neet_2024.csv', index=False)

# save new_df into 4 csv files for ease of storage

# split the new_df into 4 csv files

# split the new_df into 4 csv files (2300209, 8)
df1 = new_df.iloc[:575052]
df2 = new_df.iloc[575052:1150104]
df3 = new_df.iloc[1150104:1725156]
df4 = new_df.iloc[1725156:]

df1.to_csv('all_data_neet_2024_1.csv', index=False)
df2.to_csv('all_data_neet_2024_2.csv', index=False)
df3.to_csv('all_data_neet_2024_3.csv', index=False)
df4.to_csv('all_data_neet_2024_4.csv', index=False)


In [None]:
# read the 4 csv files and merge into data 
data1 = pd.read_csv('all_data_neet_2024_1.csv')
data2 = pd.read_csv('all_data_neet_2024_2.csv')
data3 = pd.read_csv('all_data_neet_2024_3.csv')
data4 = pd.read_csv('all_data_neet_2024_4.csv')
data = pd.concat([data1, data2, data3, data4])

In [64]:
# compress and stor the csv file
!zip all_data_neet_2024.zip all_data_neet_2024.csv

  adding: all_data_neet_2024.csv (deflated 97%)
