## Clean and normalize "metro_startup_ranking.csv" dataset

> Review the data and perform basic cleaning before merging

### Import some useful libraries

In [1]:
import pandas as pd
import numpy as np
print("import completed")

import completed


### Import data from csv file

In [2]:
# Source files, relative to the notebook's working directory.
metro_ranking_path = r"../data/metro_startup_ranking.csv"
state_lookup_path = r"../data/csvData.csv"

# df1: the metro startup ranking table being cleaned.
# df2: the state lookup table used later to attach 2-letter state codes.
df1 = pd.read_csv(metro_ranking_path)
df2 = pd.read_csv(state_lookup_path)

### Use external data to normalize the state column ("csvData.csv" maps each state name to its 2-letter abbreviation)

In [3]:
# Reduce each "Metro Area States" entry to the text before its first "-"
# (entries appear to list several states joined by hyphens; only the first
# one is kept so the column can be matched against full state names).
#
# The split is applied to the whole column at once, so it covers every row
# regardless of how many rows the file has or what the index labels are.
# Missing values stay missing instead of raising.
df1["Metro Area States"] = df1["Metro Area States"].str.split("-").str[0]

In [4]:
# Attach the state lookup columns to every metro row by matching the metro's
# state name against the lookup table's "State" column.
# An inner join keeps only rows whose state name is found in both tables.
df = df1.merge(
    df2,
    how="inner",
    left_on="Metro Area States",
    right_on="State",
)

### Review data

**1. Review the first ten rows**

In [5]:
# Preview the first ten merged rows. Leaving the frame as the cell's last
# expression lets Jupyter render it as a table rather than the wrapped plain
# text produced by print().
df.head(10)

   Metro Area Code                 Metro Area Name Metro Area Main City  \
0            12060  Atlanta-Sandy Springs-Marietta              Atlanta   
1            12420    Austin-Round Rock-San Marcos               Austin   
2            19100     Dallas-Fort Worth-Arlington               Dallas   
3            26420      Houston-Sugar Land-Baytown              Houston   
4            41700       San Antonio-New Braunfels           SanAntonio   
5            12580                Baltimore-Towson            Baltimore   
6            14460         Boston-Cambridge-Quincy               Boston   
7            16740    Charlotte-Gastonia-Rock Hill            Charlotte   
8            16980       Chicago-Joliet-Naperville              Chicago   
9            17140           Cincinnati-Middletown           Cincinnati   

  Metro Area States  Startup Rank           State Abbrev Code  
0           Georgia            13         Georgia    Ga.   GA  
1             Texas             1           Te

**2. Review basic info**

In [6]:
# Shape of the merged frame as (rows, columns).
print(df.shape)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

(40, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 39
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Metro Area Code       40 non-null     int64 
 1   Metro Area Name       40 non-null     object
 2   Metro Area Main City  40 non-null     object
 3   Metro Area States     40 non-null     object
 4   Startup Rank          40 non-null     int64 
 5   State                 40 non-null     object
 6   Abbrev                40 non-null     object
 7   Code                  40 non-null     object
dtypes: int64(2), object(6)
memory usage: 2.8+ KB
None


**3. Check for duplicated data** 

In [7]:
# Boolean mask that is True for every row repeating an earlier row in full.
duplicate_mask = df.duplicated()

# Print the repeated rows; an empty frame means there are no exact duplicates.
duplicate_rows = df.loc[duplicate_mask]
print(duplicate_rows)

Empty DataFrame
Columns: [Metro Area Code, Metro Area Name, Metro Area Main City, Metro Area States, Startup Rank, State, Abbrev, Code]
Index: []


**4. Manually check the number of unique values in each column**

In [8]:
# Report how many distinct values each column of interest holds.
columns_to_count = (
    "Metro Area Code",
    "Metro Area Name",
    "Metro Area Main City",
    "Metro Area States",
    "Startup Rank",
)
for column in columns_to_count:
    print(f"{column} unique values:", len(df[column].unique()))

# Smallest and largest rank, to compare against the number of rows.
startup_rank = df["Startup Rank"]
print(startup_rank.min(), startup_rank.max())

Metro Area Code unique values: 40
Metro Area Name unique values: 40
Metro Area Main City unique values: 40
Metro Area States unique values: 25
Startup Rank unique values: 40
1 40


**5. Drop unused columns**

In [9]:
# Columns not carried into the exported dataset; the 2-letter "Code" column is
# kept in place of the full state name and the other abbreviation column.
unused_columns = ["Metro Area Code", "Metro Area States", "State", "Abbrev"]
df = df.drop(columns=unused_columns)

**6. Sort the data in ascending order by "startup rank"**

In [10]:
# Order rows from the smallest rank number upwards and renumber the index
# 0..n-1 so it follows the new order (the old index is discarded).
df = df.sort_values("Startup Rank", ascending=True).reset_index(drop=True)
df

Unnamed: 0,Metro Area Name,Metro Area Main City,Startup Rank,Code
0,Austin-Round Rock-San Marcos,Austin,1,TX
1,Miami-Fort Lauderdale-Pompano Beach,Miami,2,FL
2,San Jose-Sunnyvale-Santa Clara,SanJose,3,CA
3,Los Angeles-Long Beach-Santa Ana,LosAngeles,4,CA
4,Denver-Aurora-Broomfield,Denver,5,CO
5,San Francisco-Oakland-Fremont,SanFrancisco,6,CA
6,New York-Northern New Jersey-Long Island,NewYork,7,NY
7,Houston-Sugar Land-Baytown,Houston,8,TX
8,San Diego-Carlsbad-San Marcos,SanDiego,9,CA
9,San Antonio-New Braunfels,SanAntonio,10,TX


### Cities above the 75th percentile (use `quantile(0.25)` because a smaller rank number is better, so the best-ranked quarter lies below the 0.25 quantile of the rank values)

In [11]:
# Rank value at the 0.25 quantile of the "Startup Rank" column.
rank_cutoff = df["Startup Rank"].quantile(0.25)

# Keep only the rows whose rank is strictly below that cutoff.
below_cutoff = df["Startup Rank"].lt(rank_cutoff)
df = df.loc[below_cutoff]

In [None]:
# Display the rows that remain in `df` at this point in the notebook.
df

### Export the data

In [13]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the inputs are read from ../data but this writes to ./dataset;
# confirm that directory is the intended target and exists before running.
output_path = "./dataset/metro_startup_rank.csv"
df.to_csv(output_path, index=False)
print("Export completed")

Export completed
