In [353]:
import pandas as pd
import numpy as np

In [354]:
# Import tabula to read pdf file
import tabula

In [None]:
# Extract every table from every page of the PDF (pages="all").
# The result is indexed per table further down (mak23[0], mak23[1]).
mak23 = tabula.read_pdf("examples/Mak2023-2024.pdf", pages="all")

In [None]:
# This data is rough and improperly structured
mak23

In [None]:
# Index the first table in the mak23 pdf
# The data is highly unordered and requires pre-processing.
# A number of techniques shall be used to clean and prepare the data into a properly
# structured tabular format.
mak23[0]

In [None]:
# Discard every row that has at least one missing value.
# Note that the row at index 3 holds what would have been the column names
# (Form id, Index_no, ...), and that the original data index was read in as a
# column of its own: "Unnamed: 0".
data1 = mak23[0].dropna(axis="index", how="any")

In [None]:
data1

In [None]:
# This returns the columns in the DataFrame
data1.columns

In [None]:
# Drop column: Unnamed: 0
# drop() returns a new frame, and errors="ignore" makes the cell safe to re-run:
# `del data1[...]` raises KeyError the second time because the column is already gone.
data1 = data1.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
len(data1['MAKERERE UIVERSITY'])

In [None]:
# Key data is in the 'MAKERERE UIVERSITY' column.
# Employ Python string-processing methods to extract such data.

In [None]:
data1.iloc[0]

In [None]:
data1

In [None]:
data1.index

In [None]:
# Renumber the rows 0..n-1.
# reindex([0..n-1]) does not renumber: it *selects* those labels, so any label that
# dropna() removed comes back as an all-NaN row and labels >= n are lost.
# reset_index(drop=True) keeps every row and only replaces the labels.
data2 = data1.reset_index(drop=True)

In [None]:
data2

In [None]:
# Consecutive integers 0..n-1, one per row of data1, as a plain Python list
newIndex = list(range(len(data1.index)))
type(newIndex)

In [None]:
data1

In [None]:
# Total weights as a Series: column "Unnamed: 2" without its first row
weights = data1["Unnamed: 2"].iloc[1:]
weights

In [None]:
# Form IDs as a Series: column 'Unnamed: 1' without its first row
formId = data1['Unnamed: 1'].iloc[1:]
formId

In [None]:
# Select the 'MAKERERE UIVERSITY' column, whose cells hold the mingled fields
# that still have to be separated
data2 = data1.loc[:, 'MAKERERE UIVERSITY']

In [None]:
# Raw row strings as a plain list, leaving out the first row of data2
data = list(data2.iloc[1:])
data

In [None]:
# Column labels for the cleaned table, in the order dataProcess returns its fields
DataColumns = [
    "Index No",
    "NAME",
    "GENDER",
    "UACE YEAR",
    "CODE",
    "DISTRICT",
    "COURSE CODE",
    "COURSE NAME",
]

In [None]:
# Function to extract one student's fields from a raw row string
def dataProcess(n, rows=None):
    """Split the n-th raw row into its eight student fields.

    The row is split on whitespace and the fields are located by position,
    counting from the right-hand end:

        <index no> <name tokens...> <year> <district code> <district>
        <course code> <course name: exactly the last three tokens>

    The gender is the last character of the final name token, so that
    character is cut off the returned name.

    Parameters
    ----------
    n : int
        Position of the row to parse.
    rows : sequence of str, optional
        Rows to read from. When omitted, the notebook-level ``data`` list is
        used, which keeps existing ``dataProcess(i)`` calls working.

    Returns
    -------
    list
        ``[index_no, name, gender, UACE_YEAR, DistrictCode, District,
        CourseCode, Course]`` with the year and district code as ``int``.

    Raises
    ------
    ValueError
        If the row has fewer than nine tokens, or if the year or district
        code token is not an integer.
    IndexError
        If ``n`` is outside ``rows``.
    """
    if rows is None:
        rows = data
    tokens = rows[n].split()
    # One index token, at least one name token and seven trailing tokens are needed.
    # With only eight tokens the slices below would not fail: they would return an
    # empty name and take the "gender" from the index number.
    if len(tokens) < 9:
        raise ValueError(
            f"row {n} has {len(tokens)} fields, expected at least 9: {rows[n]!r}"
        )
    index_no = tokens[0]
    # The gender letter is glued to the last name token: read it, then trim it off.
    gender = tokens[-8][-1]
    name = " ".join(tokens[1:-7])[:-1]
    UACE_YEAR = int(tokens[-7])
    DistrictCode = int(tokens[-6])
    District = tokens[-5]
    CourseCode = tokens[-4]
    Course = " ".join(tokens[-3:])
    return [index_no, name, gender, UACE_YEAR,
            DistrictCode, District, CourseCode, Course]

In [None]:
# Create a data frame with no data
# (an empty frame that carries only the DataColumns labels; df is reassigned from
# studentList in a later cell)
df = pd.DataFrame([], columns=DataColumns)

In [None]:
df

In [None]:
data

In [None]:
# Parse every raw row, in order, into its list of fields
studentList = list(map(dataProcess, range(len(data))))

In [None]:
studentList

In [None]:
# Cleaned table: one row per parsed student, labelled with DataColumns
df = pd.DataFrame(data=studentList, columns=DataColumns)

In [None]:
df

In [None]:
# Add the Total Weight column to the DataFrame.
# Plain column assignment appends the column after the existing ones (where
# insert(8, ...) put it on the 8-column frame) and just overwrites it when the cell is
# re-run, whereas df.insert() raises ValueError once the column exists.
# list(...) discards the index of `weights`, so values are matched to rows by position.
df["Total Weight"] = list(weights)

In [None]:
# Insert formId Column as the first column
# Qn: Can we insert multiple columns at once🤔
# df.insert() raises ValueError if the column already exists, so on a re-run the
# existing column is overwritten where it stands instead of being inserted again.
# list(...) discards the index of `formId`, so values are matched to rows by position.
if "Form ID" in df.columns:
    df["Form ID"] = list(formId)
else:
    df.insert(0, "Form ID", list(formId))

In [None]:
df

<h3>Assignment: Preprocess the remaining data tables to have them similar to the data above.</h3>

In [None]:
# Are there better ways of doing the above tasks?

# <h3>Data description</h3>

In [None]:
# which district had a higher intake and what is its frequency
df.DISTRICT.describe()

In [None]:
# In the data portion, there are 26 unique districts.
# Wakiso has one of the highest frequencies of student admissions.
# Budaka is one of the districts with the lowest frequency of admissions.

In [None]:
# why is Budaka returned as the district with the lowest frequency
# NOTE: min() on a column of strings returns the alphabetically first value, not the
# least frequent one; value_counts() is what reports how often each district occurs.
df.DISTRICT.min()

In [None]:
# why is Wakiso returned as the district with the highest frequency
# df.DISTRICT.max(include='all')

In [None]:
df.DISTRICT.value_counts()

In [None]:
# analysis by SEX

In [None]:
# More males than females were admitted
df.GENDER.value_counts()

In [None]:
# Count gender by district: one row per district, one column per gender.
# crosstab tallies how often each (DISTRICT, GENDER) pair occurs and puts 0 where a
# pair never occurs. The previous DataFrame(...) call re-labelled the overall gender
# counts with district rows and per-student gender columns, which does not produce
# per-district counts.
df_district_gender = pd.crosstab(df["DISTRICT"], df["GENDER"])

In [None]:
# How can we get district gender count?

In [None]:
# Take the first row of df (all columns) as a single record
record0 = df.iloc[0, :]

In [None]:
# Extracting multiple column data
record0[["Form ID", "NAME"]]

In [None]:
record0

In [None]:
# Take the second row of df (all columns) as a single record
record1 = df.iloc[1, :]

In [None]:
record1

In [None]:
# You can concatenate records from different dataframes (sources).
# For example, in the admission data, you may want a dataframe of students from a
# specific district.
# One key per concatenated record: each record's own row label (Series.name).
# Passing the whole df.index supplied more keys than objects; pandas only used the
# first two, and newer pandas versions deprecate that length mismatch.
newdf = pd.concat([record0, record1], keys=[record0.name, record1.name]).unstack()

In [None]:
newdf

In [None]:
# Rows of df whose DISTRICT is "WAKISO"
is_wakiso = df["DISTRICT"] == "WAKISO"
wakisoStudents = df.loc[is_wakiso]

In [None]:
# function definition
def district(district, frame=None):
    """Count the students of each gender admitted from one district.

    Parameters
    ----------
    district : str
        District name, compared for exact equality with the "DISTRICT" column.
    frame : pandas.DataFrame, optional
        Table with "DISTRICT" and "GENDER" columns. When omitted, the
        notebook-level ``df`` is used, so existing ``district("WAKISO")``
        calls keep working.

    Returns
    -------
    pandas.Series
        Gender value -> number of matching rows; empty when no row matches.
    """
    if frame is None:
        frame = df
    in_district = frame["DISTRICT"] == district
    return frame.loc[in_district, "GENDER"].value_counts()

In [None]:
district("WAKISO")

In [None]:
wakisoStudents

In [None]:
# Value Count: Returns count of distinct values
wakisoStudents["GENDER"].value_counts()

In [None]:
# The DISTRICT column of df as a Series
districts = pd.Series(df["DISTRICT"])

In [None]:
districts

In [None]:
# Gender counts per district, keyed by district name.
# Iterating over unique() evaluates each district once, in order of first appearance,
# instead of recomputing and overwriting the same entry for every student row.
distDict = {name: dict(district(name)) for name in districts.unique()}

In [None]:
# by default, unstack works for the inner index in hierarchical index.
# Here the frame built from distDict has a single-level index, so unstack() returns a
# Series whose two-level index is (column label, row label).
distResult = pd.DataFrame(distDict).unstack()

In [None]:
# distResult

In [None]:
# Wrap the distResult Series in a one-column DataFrame
distGender = pd.DataFrame(data=distResult)

In [None]:
# distGender.columns.names = ["Enrollment"]

In [None]:
# Name the two index levels: outer level = district, inner level = gender
distGender.index.names = ["District", "Gender"]

In [None]:
# Give the single column (labelled 0) the name "Count"
distGender = distGender.rename({0: "Count"}, axis="columns")

In [None]:
# Replace missing counts with 0
distGender = distGender.fillna(value=0)

In [None]:
# Indexing the first three rows (the slice [:3] stops before position 3)
distGender[:3]

In [None]:
# Indexing a specific district
distGender.loc["IGANGA"]

In [None]:
district("MASAKA")

In [None]:
wakisoStudents

In [None]:
# Rows of df whose DISTRICT is 'KAMPALA'
kampalaAdmissions = df.loc[df["DISTRICT"] == 'KAMPALA']

In [None]:
kampalaAdmissions

In [None]:
# Admission numbers by region: Central, Eastern, Western, Northern.
# Representative districts for the Central region.
# NOTE(review): the DISTRICT values filtered on elsewhere in this notebook are
# upper-case (e.g. "WAKISO"), so these title-case names presumably need
# case-normalising before any comparison with df["DISTRICT"] — confirm.
central = {
    'Kampala',
    'Wakiso',
    'Mpigi',
    'Mukono',
    'Luwero',
}

In [None]:
# Representative districts for the Eastern region
eastern = {
    'Iganga',
    'Mayuge',
    'Jinja',
    'Bugiri',
}

In [None]:
# Representative districts for the Western region.
# 'Rukugiri' was a misspelling of the district name 'Rukungiri' and could never match
# a correctly spelled district value.
western = {'Mbarara', 'Rukungiri', 'Kabale', 'Kisoro'}

In [None]:
# Representative districts for the Northern region
northern = {
    'Gulu',
    'Amolatar',
    'Adjumani',
    'Apac',
    'Arua',
    'Kaabong',
    'Kitgum',
}

In [None]:
mak23[1]

# Combining more tables

In [None]:
# Work on a copy of the second extracted table, so that the in-place edits made to
# studentData2 below (deleting a column, assigning new column labels) do not alter
# the raw table kept in mak23.
studentData2 = mak23[1].copy()

In [None]:
# The df columns need editing
studentData2

In [None]:
# Remove the "41" column.
# drop() returns a new frame instead of deleting from the table in place, and
# errors="ignore" keeps the cell re-runnable: `del` raises KeyError once the column
# is already gone.
studentData2 = studentData2.drop(columns="41", errors="ignore")

In [None]:
# Column labels of the second table, kept so they can be reused as a data row below.
# `dataTable2` is not defined in any cell above, so the original line raised NameError
# on a fresh kernel; studentData2 is the table these labels belong to.
xdata = studentData2.columns

In [None]:
# Change columns by re-assigning
# NOTE: this rebinds DataColumns (until now the 8-name list) to df's column Index,
# which at this point also contains "Form ID" and "Total Weight".
DataColumns = df.columns
DataColumns

In [None]:
# One-row DataFrame whose only record is xdata, labelled with DataColumns
df2 = pd.DataFrame(data=[xdata], columns=DataColumns)

In [None]:
df2

In [None]:
# Relabel the columns with DataColumns.
# The labels are assigned on a fresh copy, because studentData2 may still be the very
# object stored in mak23[1]; relabelling it in place would rename the raw table too.
studentData2 = studentData2.copy()
studentData2.columns = DataColumns

In [None]:
# Put the row recovered from the column labels (xdata) on top of the table's rows.
# The one-row frame is rebuilt from xdata here rather than read back from df2, so
# re-running this cell does not append studentData2 to an already-concatenated df2.
df2 = pd.concat(
    [pd.DataFrame([xdata], columns=DataColumns), studentData2],
    ignore_index=True,
)

In [None]:
df2

In [None]:
# Concatenate the df and df2 tables.
pd.concat([df, df2], ignore_index=True)