## Clean and normalize "University_info.csv" dataset

> Review the data and perform basic cleaning on the dataset before merging

### Import some useful libraries

In [1]:
# pandas does all of the loading and cleaning in the cells below.
import pandas as pd
# NOTE(review): numpy is not referenced in the cells visible here — confirm it is still needed.
import numpy as np
# Visible confirmation that the import cell has run.
print("import completed")

import completed


### Import the data from csv file

> The original file is very large; with the default chunked parsing, `read_csv` may infer mixed dtypes within a column => set `low_memory=False` so the file is parsed in a single pass

In [2]:
# Load the raw university dataset; low_memory=False turns off pandas'
# chunked low-memory parsing for this read.
df = pd.read_csv(filepath_or_buffer=r"../data/University_info.csv", low_memory=False)

### Review the data

> The file is very large and contains unnecessary information => drop irrelevant columns before actually reading through the data

**1. Review first 5 rows**

In [3]:
# Plain-text preview of the first five rows of the raw frame.
print(df.head(n=5))

   UNITID    OPEID  opeid6                               INSTNM        CITY  \
0  100654   100200    1002             Alabama A & M University      Normal   
1  100663   105200    1052  University of Alabama at Birmingham  Birmingham   
2  100690  2503400   25034                   Amridge University  Montgomery   
3  100706   105500    1055  University of Alabama in Huntsville  Huntsville   
4  100724   100500    1005             Alabama State University  Montgomery   

  STABBR         ZIP                                       AccredAgency  \
0     AL       35762  Southern Association of Colleges and Schools C...   
1     AL  35294-0110  Southern Association of Colleges and Schools C...   
2     AL  36117-3553  Southern Association of Colleges and Schools C...   
3     AL       35899  Southern Association of Colleges and Schools C...   
4     AL  36104-0271  Southern Association of Colleges and Schools C...   

                          INSTURL  \
0                   www.aamu.edu/   


**2. Remove columns that contain only `null` values**

In [4]:
# Discard every column whose values are all missing.
df = df.dropna(axis="columns", how="all")

**3. Using "dictionary_university_info.csv", take necessary columns**

In [5]:
# Keep only the columns used by the rest of the notebook.
df = df.loc[
    :,
    ["INSTNM", "CITY", "STABBR", "LOCALE", "CIP11ASSOC", "CIP11BACHL", "WOMENONLY"],
]
# Guard flag read by the duplicate-drop cell so that drop runs only once.
check = 0

### Review the actual data

**1.Read first 5 rows**

In [6]:
# Plain-text preview of the first five rows after column selection.
print(df.head(n=5))

                                INSTNM        CITY STABBR  LOCALE  CIP11ASSOC  \
0             Alabama A & M University      Normal     AL    12.0         0.0   
1  University of Alabama at Birmingham  Birmingham     AL    12.0         0.0   
2                   Amridge University  Montgomery     AL    12.0         2.0   
3  University of Alabama in Huntsville  Huntsville     AL    12.0         0.0   
4             Alabama State University  Montgomery     AL    12.0         0.0   

   CIP11BACHL  WOMENONLY  
0         2.0        0.0  
1         2.0        0.0  
2         2.0        0.0  
3         2.0        0.0  
4         2.0        0.0  


**2. Review numeric columns**

In [7]:
# Show the (rows, columns) shape, then summary statistics of the numeric
# columns, each on its own line block.
print(df.shape, df.describe(), sep="\n")

(7804, 7)
            LOCALE   CIP11ASSOC   CIP11BACHL    WOMENONLY
count  7380.000000  7383.000000  7383.000000  7383.000000
mean     19.589024     0.419206     0.367872     0.005824
std       9.380431     0.695182     0.673537     0.076099
min      11.000000     0.000000     0.000000     0.000000
25%      12.000000     0.000000     0.000000     0.000000
50%      21.000000     0.000000     0.000000     0.000000
75%      22.000000     1.000000     1.000000     0.000000
max      43.000000     2.000000     2.000000     1.000000


**3. Review basic info for all columns**

In [8]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7804 entries, 0 to 7803
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   INSTNM      7804 non-null   object 
 1   CITY        7804 non-null   object 
 2   STABBR      7804 non-null   object 
 3   LOCALE      7380 non-null   float64
 4   CIP11ASSOC  7383 non-null   float64
 5   CIP11BACHL  7383 non-null   float64
 6   WOMENONLY   7383 non-null   float64
dtypes: float64(4), object(3)
memory usage: 426.9+ KB
None


**4. Check for duplication**

In [9]:
# Boolean mask of rows that repeat an earlier row across all columns
# (the first occurrence is not flagged).
dup = df.duplicated(keep="first")
# Show the flagged rows so they can be inspected.
duprow = df.loc[dup]
print(duprow)

                                                 INSTNM          CITY STABBR  \
1016             Cannella School of Hair Design-Chicago       Chicago     IL   
4485             Cannella School of Hair Design-Chicago       Chicago     IL   
5674                          Dewey University-Hato Rey      Hato Rey     PR   
6966  Trend Setters' Academy of Beauty Culture-Louis...    Louisville     KY   
7031                          Fortis College-Montgomery    Montgomery     AL   
7080                  Tulsa Welding School-Jacksonville  Jacksonville     FL   
7298                         Tulsa Welding School-Tulsa         Tulsa     OK   
7316             Southern Careers Institute-San Antonio   San Antonio     TX   

      LOCALE  CIP11ASSOC  CIP11BACHL  WOMENONLY  
1016    11.0         0.0         0.0        0.0  
4485    11.0         0.0         0.0        0.0  
5674    11.0         0.0         0.0        0.0  
6966    11.0         0.0         0.0        0.0  
7031    12.0         0.0     

- drop duplicated rows 

In [10]:
# Remove every fully duplicated row, keeping the first occurrence — exactly the
# rows that df.duplicated() flags. This replaces a hard-coded list of index
# labels copied from one run's output, which would drop the wrong rows (or
# raise KeyError) as soon as the input file changed. drop_duplicates() is also
# idempotent, so re-running this cell is safe without a guard flag.
df = df.drop_duplicates()
# Still set the flag so anything that reads `check` sees the "already dropped" state.
check = 1

- re-check for duplications

In [11]:
# Recompute the duplicate mask after the drop; the printed frame is expected
# to be empty.
dup = df.duplicated(keep="first")
duprow = df.loc[dup]
print(duprow)

Empty DataFrame
Columns: [INSTNM, CITY, STABBR, LOCALE, CIP11ASSOC, CIP11BACHL, WOMENONLY]
Index: []


**5. Filter the dataset according to the requirements**

In [12]:
# Helper column: combined CIP11 associate + bachelor values, used only for filtering.
df["CIP11EXT"] = df["CIP11ASSOC"] + df["CIP11BACHL"]
# Keep rows with LOCALE below 14, WOMENONLY equal to 0, and a non-zero CIP11 total.
# NOTE(review): LOCALE < 14 presumably selects the "city" locale codes — confirm against the data dictionary.
keep_mask = (df["LOCALE"] < 14) & (df["WOMENONLY"] == 0) & (df["CIP11EXT"] != 0)
# Renumber the surviving rows from 0 so later positional access lines up.
df = df[keep_mask].reset_index(drop=True)
# Guard flag read by the next cell so its column drop runs only once.
check2 = 0

In [13]:
# Drop the columns that were only needed for filtering. The check2 flag keeps
# a re-run of this cell from trying to drop columns that are already gone.
if check2 == 0:
    df = df.drop(columns=["WOMENONLY", "CIP11EXT"])
    check2 = 1

**6. Normalize city names (remove spaces and periods) so every table uses the same format**

In [14]:
# Strip every space and every period from CITY in one vectorized pass.
# This replaces two row-by-row loops over a hard-coded range(1362), which raise
# KeyError when the filtered frame is shorter and silently leave rows
# unprocessed when it is longer.
# regex=False is required: interpreted as a regex, "." would match every character.
df["CITY"] = (
    df["CITY"]
    .str.replace(" ", "", regex=False)
    .str.replace(".", "", regex=False)
)

### Double-check the data

**1. Read first 5 rows**

In [15]:
# Plain-text preview of the first five rows of the cleaned frame.
print(df.head(n=5))

                                INSTNM        CITY STABBR  LOCALE  CIP11ASSOC  \
0             Alabama A & M University      Normal     AL    12.0         0.0   
1  University of Alabama at Birmingham  Birmingham     AL    12.0         0.0   
2                   Amridge University  Montgomery     AL    12.0         2.0   
3  University of Alabama in Huntsville  Huntsville     AL    12.0         0.0   
4             Alabama State University  Montgomery     AL    12.0         0.0   

   CIP11BACHL  
0         2.0  
1         2.0  
2         2.0  
3         2.0  
4         2.0  


**2. View basic info**

In [16]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1362 entries, 0 to 1361
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   INSTNM      1362 non-null   object 
 1   CITY        1362 non-null   object 
 2   STABBR      1362 non-null   object 
 3   LOCALE      1362 non-null   float64
 4   CIP11ASSOC  1362 non-null   float64
 5   CIP11BACHL  1362 non-null   float64
dtypes: float64(3), object(3)
memory usage: 64.0+ KB
None


**3. Count null values**

In [17]:
# Per-column count of missing values (summed down the rows).
print(df.isnull().sum(axis=0))

INSTNM        0
CITY          0
STABBR        0
LOCALE        0
CIP11ASSOC    0
CIP11BACHL    0
dtype: int64


### Export the data

In [18]:
# Write the cleaned frame to CSV without the row index, then confirm.
df.to_csv(path_or_buf="./dataset/uni_info.csv", index=False)
print("Export completed")

Export completed
