# Python Code: Data Transformation (wide to long format)

This notebook takes the joined SAT data (which now includes the ID column) and reshapes it from wide to long format, so that it is normalized and ready to load into a normalized database.

**Input:** SAT_wide_format.txt

**Output:** SAT_long_format.txt (Tidy Data)

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

### Print files in directory

In [2]:
# List the files in the current working directory (where SAT_wide_format.txt is read from)
ls

[34m-archive[m[m/                          image.png
[31mSAT.txt[m[m*                           [31mlayout.xlsx[m[m*
SAT_clean.txt                      mysql_code_Data_Task.sql
SAT_wide_format.txt                python_code_Transform.ipynb
automated_QA_Report_Example.ipynb  python_code_initial_QA.ipynb
[31menrollment.txt[m[m*


In [3]:
# Load the tab-delimited, wide-format SAT text file into a dataframe
sat_wide_df = pd.read_csv('SAT_wide_format.txt', sep='\t')

# Spot-check four randomly chosen rows to confirm the file parsed as expected
sat_wide_df.sample(n=4)

Unnamed: 0,CO_CODE,DIST_CODE,SCH_CODE,TOTAL,MATHEMATICS,CRITICAL_READING,WRITING,SAT_1550,N_STUDENTS_SCORED,ID
95,7,2670,5,1335,452,433,450,18.6,313,72670005
269,27,5660,30,1655,565,545,545,65.9,625,275660030
329,37,2465,50,1514,528,501,485,40.6,959,372465050
343,39,1320,404,1143,410,356,377,4.0,10,391320404


In [4]:
# Columns carried forward into the wide-to-long transformation
selected_columns = ['ID', 'CO_CODE', 'TOTAL', 'MATHEMATICS',
                    'CRITICAL_READING', 'WRITING', 'SAT_1550']

# Take an independent copy so later changes never touch sat_wide_df
sat_wide = sat_wide_df.loc[:, selected_columns].copy()

# Preview the first five rows of the selection
sat_wide.head()

Unnamed: 0,ID,CO_CODE,TOTAL,MATHEMATICS,CRITICAL_READING,WRITING,SAT_1550
0,10110010,1,1320,461,429,430,23.5
1,10120010,1,1502,515,506,481,41.7
2,10590025,1,1418,486,468,464,31.1
3,11310005,1,1479,504,490,485,37.8
4,11790040,1,1477,508,486,483,35.7


In [5]:
# Score columns to unpivot, named explicitly: melt's value_vars expects column
# labels, and a positional slice (iloc[:, 2:7]) would silently select the
# wrong columns if the column order of sat_wide ever changed.
score_columns = ['TOTAL', 'MATHEMATICS', 'CRITICAL_READING', 'WRITING', 'SAT_1550']

# Unpivot the wide-format data into a Tidy Dataset ready for MySQL Load:
# one row per (ID, DATA_TYPE) pair, with the corresponding score in VALUE.
# CO_CODE is neither an id nor a value column, so melt leaves it out.
sat_long_format = sat_wide.melt(id_vars='ID',
                                value_vars=score_columns,
                                var_name='DATA_TYPE',
                                value_name='VALUE')

# View Tidy Data by sorting by ID and then DATA_TYPE (to match pdf handout)
sat_long_format.sort_values(by=['ID', 'DATA_TYPE']).head(6)

Unnamed: 0,ID,DATA_TYPE,VALUE
760,10110010,CRITICAL_READING,429.0
380,10110010,MATHEMATICS,461.0
1520,10110010,SAT_1550,23.5
0,10110010,TOTAL,1320.0
1140,10110010,WRITING,430.0
761,10120010,CRITICAL_READING,506.0


In [6]:
# Write the long-format (tidy) SAT data to a tab-delimited text file, leaving out the row index
sat_long_format.to_csv(path_or_buf='SAT_long_format.txt', index=False, sep='\t')

### Print files in directory to confirm the text file was exported to the project folder

In [7]:
# List the working directory again; SAT_long_format.txt should now appear
ls

[34m-archive[m[m/                          [31menrollment.txt[m[m*
[31mSAT.txt[m[m*                           image.png
SAT_clean.txt                      [31mlayout.xlsx[m[m*
SAT_long_format.txt                mysql_code_Data_Task.sql
SAT_wide_format.txt                python_code_Transform.ipynb
automated_QA_Report_Example.ipynb  python_code_initial_QA.ipynb
