# ACT data: import and cleaning

#### In this notebook the ACT dataset "act_2018.csv" is imported and cleaned for the project's purposes.

In [54]:
import pandas as pd

In [55]:
act = pd.read_csv('../data/act_2018.csv')

In [56]:
type(act)

pandas.core.frame.DataFrame

In [57]:
act.head()

Unnamed: 0,State,Participation,Composite
0,Alabama,100%,19.1
1,Alaska,33%,20.8
2,Arizona,66%,19.2
3,Arkansas,100%,19.4
4,California,27%,22.7


In [58]:
act.shape

(52, 3)

In [59]:
act.columns

Index(['State', 'Participation', 'Composite'], dtype='object')

In [60]:
act.info()  # Display dtypes to see if something must be modified

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State          52 non-null     object 
 1   Participation  52 non-null     object 
 2   Composite      52 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.3+ KB


In [61]:
act.isnull().sum() # Checking for missing values

State            0
Participation    0
Composite        0
dtype: int64

In [62]:
# function to check if there is any obvious issue with the "Composite" column. ACT score ranges from 1 to 36.
def any_issues_data(data, low=1, high=36):
    """Return the scores in ``data`` that fall outside the valid score range.

    Parameters
    ----------
    data : iterable of numbers
        Scores to validate, e.g. the ACT "Composite" column.
    low, high : number, optional
        Inclusive bounds of the valid range. The defaults (1 and 36) are the
        ACT composite score limits.

    Returns
    -------
    list
        The offending values in their original order; empty when every score
        is valid.
    """
    # Written as "not inside the range" rather than "< low or > high" so that
    # NaN, for which every comparison is False, is reported instead of
    # silently passing the check.
    return [score for score in data if not (low <= score <= high)]

In [63]:
any_issues_data(act['Composite']) #Running the function with the "Composite" column data.

[]

#### We can observe (via the empty list returned by the "any_issues_data" function) that there are no obvious issues with the data we want to work with.

In [64]:
act.columns = [col.replace(' ', '_').lower() for col in act.columns] #Editing the column names

In [65]:
act.columns

Index(['state', 'participation', 'composite'], dtype='object')

In [66]:
act['participation_%'] = act['participation'].str.strip('%').astype('float') #Create new column where participation rates are floats not strings

In [67]:
act.head()

Unnamed: 0,state,participation,composite,participation_%
0,Alabama,100%,19.1,100.0
1,Alaska,33%,20.8,33.0
2,Arizona,66%,19.2,66.0
3,Arkansas,100%,19.4,100.0
4,California,27%,22.7,27.0


In [68]:
#Deleting not needed columns
act.drop(columns=['participation'], inplace = True)

In [69]:
act.head()

Unnamed: 0,state,composite,participation_%
0,Alabama,19.1,100.0
1,Alaska,20.8,33.0
2,Arizona,19.2,66.0
3,Arkansas,19.4,100.0
4,California,22.7,27.0


#### Next, state names are changed so this dataset has the same column format as the climate dataset.

In [70]:
# Lowercase the state names and swap spaces for underscores so they match the
# format of the climate dataset's state column. The vectorized .str methods
# avoid a Python-level loop; regex=False makes the space a literal match.
act['state'] = act['state'].str.replace(' ', '_', regex=False).str.lower()

In [71]:
act.tail(3) #Just enough to check for every possible scenario (lowercased and spaces)

Unnamed: 0,state,composite,participation_%
49,west_virginia,20.3,65.0
50,wisconsin,20.5,100.0
51,wyoming,20.0,100.0


In [72]:
act.shape #Checking number of rows and columns to see if it matches with the number of rows and columns from SAT cleaned dataset

(52, 3)

In [73]:
sat = pd.read_csv('../data/sat_clean.csv') #importing the SAT cleaned dataset

In [74]:
sat.head(3)

Unnamed: 0.1,Unnamed: 0,state,total,participation_%
0,0,alabama,1166,6.0
1,1,alaska,1106,43.0
2,2,arizona,1149,29.0


In [75]:
sat.shape #Checking number of rows and columns

(51, 4)

#### ACT dataset has one more row than the SAT dataset.

In [76]:
def state_double(data):
    """Return the values that occur more than once in ``data``.

    Parameters
    ----------
    data : iterable of hashable values
        Values to scan, e.g. the "state" column of the ACT dataset.

    Returns
    -------
    list
        One entry for every occurrence after the first, in the order
        encountered (a value seen three times is listed twice).
    """
    seen = set()   # values met so far; set membership keeps the scan linear
    double = []    # repeated values found in the "state" column
    for state in data:
        if state in seen:
            double.append(state)
        else:
            seen.add(state)
    return double

In [77]:
state_double(act['state'])

['maine']

#### Now we know that there is an extra row because the state 'maine' is repeated. The next step is to remove one of the duplicate rows.

In [78]:
i = act[act.state == 'maine'].index #Looking for the row index

In [79]:
i

Int64Index([19, 20], dtype='int64')

In [80]:
# Drop every matching row except the last one. Slicing (instead of i[0]) keeps
# this step safe to repeat: if the index lookup is re-run after the drop, i
# holds a single label, i[:-1] is empty, and the remaining row is preserved.
act.drop(i[:-1], inplace = True)

In [81]:
act.shape

(51, 3)

In [82]:
act.sort_values('state', inplace = True)

In [83]:
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default. The SAT file loaded earlier in this notebook shows an "Unnamed: 0"
# column, presumably produced the same way. Consider passing index=False here
# once it is confirmed that no downstream reader relies on that column.
act.to_csv('../data/act.csv') #Saving as csv file