In [None]:
# import packages
# Packages bundle ready-made functions, so we can call them instead of writing that code ourselves.

# pandas is used to manipulate and clean DataFrames (tables)
import pandas as pd
# in this case, numpy is used to assign values, similar to CASE/WHEN in SQL
import numpy as np

In [None]:
### Pre-work Step 1
# Field positions taken from the "caltpa file layout.txt" file, for use in pd.read_fwf() below.
# Despite the variable name, each entry is a (start, end) character span, not a width:
# start is 0-based and end is exclusive, so (0, 9) covers the first 9 characters of a line.
# The last span ends at None so the final field runs to the end of the line; an end of -1
# would cut off the last character of any line that has no trailing newline.
col_widths = [
    (0, 9),      # 1
    (9, 17),     # 2
    (17, 20),    # 3
    (20, 21),    # 4
    (21, 38),    # 5
    (38, 48),    # 6
    (48, 49),    # 7
    (49, 50),    # 8
    (50, 58),    # 9
    (58, 59),    # 10
    (59, 61),    # 11
    (61, 63),    # 12
    (63, 65),    # 13
    (65, 68),    # 14
    (68, 71),    # 15
    (71, 74),    # 16
    (74, 77),    # 17
    (77, 80),    # 18
    (80, 83),    # 19
    (83, 86),    # 20
    (86, 89),    # 21
    (89, 92),    # 22
    (92, 93),    # 23
    (93, 96),    # 24
    (96, 98),    # 25
    (98, None),  # 26: open-ended, reads to end of line
]

In [None]:
### Pre-work Step 2
# Column labels passed to pd.read_fwf() below: one label per field, in file order.
# The "Blank_N" labels name the columns that the notebook later drops as blank.
col_names = [
    'ID_Num', 'Reporting Date', 'Field Code', 'Blank_1',
    'Last Name', 'First Name', 'Middle Initial', 'Blank_2',
    'Birthdate', 'Blank_3',
    'Number of Attempts', 'Cycle Passing Status', 'Cycle Total Score',
    # 'Rubric 1' through 'Rubric 9'
    *[f'Rubric {n}' for n in range(1, 10)],
    'Blank_4', 'Institution', 'Preparation Program', 'Field Specialty',
]

In [None]:
# Relative path (folder and file name) of the fixed-width data file to load
file_location = "Data/rc4321cycle20150302.asc"

In [None]:
# Read the fixed-width-format file with pd.read_fwf(). Each (start, end) pair in
# col_widths slices one field out of every line, and col_names labels the fields.
# header=None tells pandas not to treat the first line as column names.
# ID_Num and Field Code are identifiers rather than quantities, so they are read as
# text; otherwise type inference turns all-digit values into integers and drops any
# leading zeros.
# TODO(review): other code-like columns (e.g. 'Preparation Program') may need the same
# treatment. 'Institution' is left numeric because a later cell compares it to integers.
df = pd.read_fwf(
    file_location,
    colspecs=col_widths,
    names=col_names,
    header=None,
    dtype={'ID_Num': str, 'Field Code': str},
)

In [None]:
# Preview the first rows of the DataFrame (head shows at most 5). Note: this DataFrame has only 2 rows.
df.head(n=5)

In [None]:
# Remove the four blank filler columns.
# `df` is rebound instead of using inplace=True, and labels that are already gone are
# ignored, so this cell can be re-run without raising a KeyError.
df = df.drop(
    columns=['Blank_1', 'Blank_2', 'Blank_3', 'Blank_4'],
    errors='ignore',
)

In [None]:
# Preview the DataFrame again (first rows, at most 5)
df.head(n=5)

In [None]:
# Cast "Field Code" to strings so that string methods can be applied to it
field_code_as_text = df['Field Code'].astype(str)
df['Field Code'] = field_code_as_text

In [None]:
# Left-pad "Field Code" with zeros so every value is at least three characters long
FIELD_CODE_WIDTH = 3
df['Field Code'] = df['Field Code'].str.zfill(FIELD_CODE_WIDTH)

In [None]:
# Add a "campusID" column (appended as the last column) derived from "Institution".
# np.select() works like a SQL CASE/WHEN: the first condition that matches picks the
# value, and rows matching none of the conditions get the default.
# Reference: https://stackoverflow.com/questions/49228596/pandas-case-when-default-in-pandas
# Institution -> campusID:
#   101 -> 11 (Campus A)
#   102 -> 12 (Campus B)
#   103 -> 13 (Campus C)
institution_to_campus = {101: 11, 102: 12, 103: 13}
df['campusID'] = np.select(
    condlist=[df['Institution'].eq(code) for code in institution_to_campus],
    choicelist=list(institution_to_campus.values()),
    default=0,
)

In [None]:
# Sanity check: list the distinct 'campusID' values to confirm np.select assigned them as intended
campus_ids_found = df['campusID'].unique()
print(campus_ids_found)

In [None]:
# Inspect the first rows (at most 5) of the finished DataFrame
df.head(n=5)

In [None]:
# Write the processed table to a .csv file, leaving out the row index
output_csv = "Processed_CalTPA Data.csv"
df.to_csv(output_csv, index=False)