# Linear Regression in Python
## Do curfew styles during teen years have an effect on the total number of arrests?

The data:
* Population: U.S. youth ages 12 - 17 in 1997
* Data is oversampled for blacks
   * will make regression results for blacks--a smaller group--more precise than without oversampling
   * however, descriptive statistics won't accurately reflect population without weights to correct for oversampling
* Sample: NLSY97 - Universe: UNIVERSE: R <= 13 at end of prev year; lives with mother/mother figure or father/father figure; parent/ parent figure set limits; parent/s set curfew limits or parent/s and R jointly set curfew limits

Hypotheses:
* H1: There is a significant correlation between x and y.
* H0: There is no significant correlation between x and y.


### Importing and cleaning the data

In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [119]:
# Import data
path = 'C:/Users/Hillary/Data_Science/Curfew Project/curfew/curfew.csv'
df_curfew = pd.read_csv(path)

df_curfew.info()
df_curfew.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8984 entries, 0 to 8983
Data columns (total 20 columns):
E8033100    8984 non-null int64
E8043100    8984 non-null int64
E8043500    8984 non-null int64
R0000100    8984 non-null int64
R0070700    8984 non-null int64
R0343700    8984 non-null int64
R0344200    8984 non-null int64
R0344900    8984 non-null int64
R0360300    8984 non-null int64
R0360400    8984 non-null int64
R0360500    8984 non-null int64
R0365900    8984 non-null int64
R0366000    8984 non-null int64
R0536300    8984 non-null int64
R0536401    8984 non-null int64
R0536402    8984 non-null int64
R0648900    8984 non-null int64
R1204500    8984 non-null int64
R1235800    8984 non-null int64
R1482600    8984 non-null int64
dtypes: int64(20)
memory usage: 1.4 MB


Unnamed: 0,E8033100,E8043100,E8043500,R0000100,R0070700,R0343700,R0344200,R0344900,R0360300,R0360400,R0360500,R0365900,R0366000,R0536300,R0536401,R0536402,R0648900,R1204500,R1235800,R1482600
0,0,0,0,1,1,-4,-4,-4,0,0,0,0,-4,2,9,1981,0,-3,1,4
1,0,0,0,2,1,-4,-4,-4,0,0,0,0,-4,1,7,1982,-4,-4,1,2
2,0,0,0,3,1,1,2,1,1,0,0,0,-4,2,9,1983,0,63000,1,2
3,0,0,0,4,4,-4,-4,-4,1,1,0,0,-4,2,2,1981,0,11700,1,2
4,0,0,0,5,2,-4,-4,-4,1,1,0,0,-4,1,10,1982,0,-3,1,2


In [120]:
# Import answer key data
df_answers = pd.read_csv('curfew_answer_key.csv', names='A')

df_answers.head()

Unnamed: 0,A
0,"label define vlE8033100 0 ""0"" 1 ""1 TO 4"" 5 ""..."
1,label values E8033100 vlE8033100
2,"label define vlE8043100 0 ""0: No incarceration..."
3,label values E8043100 vlE8043100
4,"label define vlR0000100 0 ""0"" 1 ""1 TO 999"" 1..."


In [121]:
# function to delete unnecessary info
def clean_answers(row):
    if 'label define' in row:
        return row[15:]  
    else:
        return None

# run above function and drop null rows
df_answers = df_answers['A'].apply(clean_answers).dropna()

# split data into ID (question ID) and ANS (answer key)
df_answers = pd.DataFrame(df_answers.str.split(' ',1).tolist(), columns=['ID', 'ANS']) 

df_answers.head()

Unnamed: 0,ID,ANS
0,E8033100,"0 ""0"" 1 ""1 TO 4"" 5 ""5 TO 9"" 10 ""10 TO 14"" ..."
1,E8043100,"0 ""0: No incarcerations"" 1 ""1 TO 2: incarcera..."
2,R0000100,"0 ""0"" 1 ""1 TO 999"" 1000 ""1000 TO 1999"" 2000..."
3,R0070700,"1 ""1. Almost none (less than 10%)"" 2 ""2. Ab..."
4,R0343700,"1 ""PARENT OR PARENTS SET LIMITS"" 2 ""PARENTS L..."


In [122]:
df_questions = pd.read_csv('curfew-questions.csv')

df_questions.head()

Unnamed: 0,ID,VAR,Q
0,E8033100,ARREST_TOTNUM,TOTAL NUMBER OF ARRESTS
1,E8043100,INCARC_TOTNUM,TOTAL NUMBER OF INCARCERATIONS
2,E8043500,INCARC_TOTMONTHS,TOTAL MONTHS INCARCERATED
3,R0000100,PUBID,"PUBID, YOUTH CASE IDENTIFICATION CODE"
4,R0070700,YPRS-1000,PERCENT OF PEERS BELONG TO A GANG


In [123]:
# create Q&A dataframe
df_qa = df_answers.merge(df_questions, on='ID')

df_qa = df_qa.drop(columns=['VAR'])

df_qa

Unnamed: 0,ID,ANS,Q
0,E8033100,"0 ""0"" 1 ""1 TO 4"" 5 ""5 TO 9"" 10 ""10 TO 14"" ...",TOTAL NUMBER OF ARRESTS
1,E8043100,"0 ""0: No incarcerations"" 1 ""1 TO 2: incarcera...",TOTAL NUMBER OF INCARCERATIONS
2,R0000100,"0 ""0"" 1 ""1 TO 999"" 1000 ""1000 TO 1999"" 2000...","PUBID, YOUTH CASE IDENTIFICATION CODE"
3,R0070700,"1 ""1. Almost none (less than 10%)"" 2 ""2. Ab...",PERCENT OF PEERS BELONG TO A GANG
4,R0343700,"1 ""PARENT OR PARENTS SET LIMITS"" 2 ""PARENTS L...",WHO SETS RS CURFEW LIMITS?
5,R0344200,"0 ""0"" 1 ""1"" 2 ""2"" 3 ""3"" 4 ""4"" 5 ""5"" 6 ""6...",R BROKE LIMITS FOR CURFEW IN LAST 30 DAYS?
6,R0344900,"1 ""DISCUSS IT CALMLY WITH YOU"" 2 ""IGNORE IT, ...",WHAT WOULD PARENT(S) DO IF R BREAKS CURFEW?
7,R0360300,"0 ""No"" 1 ""Yes""",ANY GANG IN RS NEIGHBORHOOD OR SCHOOL
8,R0360400,"0 ""No"" 1 ""Yes""","ANY RS BROTHERS, SISTERS, FRIENDS IN A GANG?"
9,R0360500,"0 ""No"" 1 ""Yes""",R EVER BELONGED TO GANG?


In [124]:
# create variable names
var_list = ['ARREST_TOT', 'INCAR_TOT', 'PUBID', 'PERC_PEERS_GANG', 'WHO_SETS_CURF', 'BR_CURF_L30', 'CURF_ENFORCM', 'NEIGH_SCH_GANG', 'SIB_FR_GANG', 'EV_GANG', 'R_ARRESTED_EV', 'ARREST_TOT97', 'SEX', 'BIRTH_MO', 'HARD_TIMES97', 'HH_INCOME97', 'CV_SAMPLE_TYPE', 'RACE_ETH']

df_qa['VAR'] = var_list

df_qa.head()

Unnamed: 0,ID,ANS,Q,VAR
0,E8033100,"0 ""0"" 1 ""1 TO 4"" 5 ""5 TO 9"" 10 ""10 TO 14"" ...",TOTAL NUMBER OF ARRESTS,ARREST_TOT
1,E8043100,"0 ""0: No incarcerations"" 1 ""1 TO 2: incarcera...",TOTAL NUMBER OF INCARCERATIONS,INCAR_TOT
2,R0000100,"0 ""0"" 1 ""1 TO 999"" 1000 ""1000 TO 1999"" 2000...","PUBID, YOUTH CASE IDENTIFICATION CODE",PUBID
3,R0070700,"1 ""1. Almost none (less than 10%)"" 2 ""2. Ab...",PERCENT OF PEERS BELONG TO A GANG,PERC_PEERS_GANG
4,R0343700,"1 ""PARENT OR PARENTS SET LIMITS"" 2 ""PARENTS L...",WHO SETS RS CURFEW LIMITS?,WHO_SETS_CURF


In [125]:
# make dictionary of ID and VAR to rename
df_idvar = df_qa[['ID', 'VAR']]

# try:
dict_rename_vars = df_idvar.set_index('ID')['VAR'].T.to_dict()

dict_rename_vars

{'E8033100': 'ARREST_TOT',
 'E8043100': 'INCAR_TOT',
 'R0000100': 'PUBID',
 'R0070700': 'PERC_PEERS_GANG',
 'R0343700': 'WHO_SETS_CURF',
 'R0344200': 'BR_CURF_L30',
 'R0344900': 'CURF_ENFORCM',
 'R0360300': 'NEIGH_SCH_GANG',
 'R0360400': 'SIB_FR_GANG',
 'R0360500': 'EV_GANG',
 'R0365900': 'R_ARRESTED_EV',
 'R0366000': 'ARREST_TOT97',
 'R0536300': 'SEX',
 'R0536401': 'BIRTH_MO',
 'R0648900': 'HARD_TIMES97',
 'R1204500': 'HH_INCOME97',
 'R1235800': 'CV_SAMPLE_TYPE',
 'R1482600': 'RACE_ETH'}

In [126]:
# check for missing vars
rename_vars_set = set(list(dict_rename_vars.keys()))
column_vars_set = set(list(df_curfew.columns))

missed_vars = list(column_vars_set - rename_vars_set)

missed_vars

['R0536402', 'E8043500']

In [127]:
# use pd.dataframe.rename with dict to rename
df_curfew = df_curfew.rename(columns=dict_rename_vars)

df_curfew = df_curfew.rename(columns={'R0536402' : 'BIRTH_YR', 'E8043500': 'INCAR_TOT_MO'})

list(df_curfew.columns)

['ARREST_TOT',
 'INCAR_TOT',
 'INCAR_TOT_MO',
 'PUBID',
 'PERC_PEERS_GANG',
 'WHO_SETS_CURF',
 'BR_CURF_L30',
 'CURF_ENFORCM',
 'NEIGH_SCH_GANG',
 'SIB_FR_GANG',
 'EV_GANG',
 'R_ARRESTED_EV',
 'ARREST_TOT97',
 'SEX',
 'BIRTH_MO',
 'BIRTH_YR',
 'HARD_TIMES97',
 'HH_INCOME97',
 'CV_SAMPLE_TYPE',
 'RACE_ETH']

In [128]:
# make dict of data Q&A
df_qa = df_qa[['VAR', 'Q','ANS']]

dict_aq = df_qa.set_index('VAR').T.to_dict()

dict_aq['INCAR_TOT_MO'] = '-3 "invalid skip"  -4 "valid skip"  -5 "noninterview"'
dict_aq['BIRTH_YR'] = 'year listed is year born' 

dict_aq['ARREST_TOT']

{'Q': 'TOTAL NUMBER OF ARRESTS',
 'ANS': '0 "0"  1 "1 TO 4"  5 "5 TO 9"  10 "10 TO 14"  15 "15 TO 19"  20 "20 TO 24"  25 "25 TO 29"  30 "30 TO 34"  35 "35 TO 39"  40 "40 TO 44"  45 "45 TO 49"  50 "50 TO 99999999: 50+"'}

In [129]:
# create new dfs to clean
df_curfew1 = df_curfew
df_curfew2 = df_curfew

df_curfew1.head()

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PUBID,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,EV_GANG,R_ARRESTED_EV,ARREST_TOT97,SEX,BIRTH_MO,BIRTH_YR,HARD_TIMES97,HH_INCOME97,CV_SAMPLE_TYPE,RACE_ETH
0,0,0,0,1,1,-4,-4,-4,0,0,0,0,-4,2,9,1981,0,-3,1,4
1,0,0,0,2,1,-4,-4,-4,0,0,0,0,-4,1,7,1982,-4,-4,1,2
2,0,0,0,3,1,1,2,1,1,0,0,0,-4,2,9,1983,0,63000,1,2
3,0,0,0,4,4,-4,-4,-4,1,1,0,0,-4,2,2,1981,0,11700,1,2
4,0,0,0,5,2,-4,-4,-4,1,1,0,0,-4,1,10,1982,0,-3,1,2


In [130]:
dict_aq['WHO_SETS_CURF']

{'Q': 'WHO SETS RS CURFEW LIMITS?',
 'ANS': '1 "PARENT OR PARENTS SET LIMITS"  2 "PARENTS LET ME DECIDE"  3 "MY PARENTS AND I DECIDE JOINTLY"'}

In [131]:
# df_curfew1 will be universe WHO_SETS_CURF
# first dataframe: delete all non-answer respondents for 'Who sets curfew?'
print(dict_aq['WHO_SETS_CURF'])

df_curfew1 = df_curfew1[df_curfew1.WHO_SETS_CURF > 0]

print(df_curfew1.info())
df_curfew1['WHO_SETS_CURF'].unique()

{'Q': 'WHO SETS RS CURFEW LIMITS?', 'ANS': '1 "PARENT OR PARENTS SET LIMITS"  2 "PARENTS LET ME DECIDE"  3 "MY PARENTS AND I DECIDE JOINTLY"'}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3506 entries, 2 to 8980
Data columns (total 20 columns):
ARREST_TOT         3506 non-null int64
INCAR_TOT          3506 non-null int64
INCAR_TOT_MO       3506 non-null int64
PUBID              3506 non-null int64
PERC_PEERS_GANG    3506 non-null int64
WHO_SETS_CURF      3506 non-null int64
BR_CURF_L30        3506 non-null int64
CURF_ENFORCM       3506 non-null int64
NEIGH_SCH_GANG     3506 non-null int64
SIB_FR_GANG        3506 non-null int64
EV_GANG            3506 non-null int64
R_ARRESTED_EV      3506 non-null int64
ARREST_TOT97       3506 non-null int64
SEX                3506 non-null int64
BIRTH_MO           3506 non-null int64
BIRTH_YR           3506 non-null int64
HARD_TIMES97       3506 non-null int64
HH_INCOME97        3506 non-null int64
CV_SAMPLE_TYPE     3506 non-null int64
RACE_ETH 

array([1, 3, 2], dtype=int64)

In [132]:
# df_curfew2 will be universe CURF_ENFORCM
print(dict_aq['CURF_ENFORCM'])

df_curfew2 = df_curfew2[df_curfew2.CURF_ENFORCM > 0]

print(df_curfew2.info())
df_curfew2['CURF_ENFORCM'].unique()

{'Q': 'WHAT WOULD PARENT(S) DO IF R BREAKS CURFEW?', 'ANS': '1 "DISCUSS IT CALMLY WITH YOU"  2 "IGNORE IT, PRETEND THAT IT DIDN\'T HAPPEN OR LET YOU GET AWAY WITH IT"  3 "SULK, POUT, OR GIVE YOU THE SILENT TREATMENT"  4 "TAKE AWAY A PRIVILEGE, GROUND YOU, OR GIVE YOU A CHORE"  5 "MAKE THREATS THAT WON\'T BE KEPT"  6 "YELL, SHOUT, OR SCREAM AT YOU"  7 "USE PHYSICAL PUNISHMENT"'}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3392 entries, 2 to 8980
Data columns (total 20 columns):
ARREST_TOT         3392 non-null int64
INCAR_TOT          3392 non-null int64
INCAR_TOT_MO       3392 non-null int64
PUBID              3392 non-null int64
PERC_PEERS_GANG    3392 non-null int64
WHO_SETS_CURF      3392 non-null int64
BR_CURF_L30        3392 non-null int64
CURF_ENFORCM       3392 non-null int64
NEIGH_SCH_GANG     3392 non-null int64
SIB_FR_GANG        3392 non-null int64
EV_GANG            3392 non-null int64
R_ARRESTED_EV      3392 non-null int64
ARREST_TOT97       3392 non-null int64
SEX  

array([1, 4, 6, 2, 3, 5, 7], dtype=int64)

In [133]:
# clean data for both (move down after/if picking one df over other)
#print(dict_aq['INCAR_TOT'])
#df_curfew['INCAR_TOT'].unique()
#df_curfew1.groupby('WHO_SETS_CURF')['PUBID'].nunique()
#---
# check various NaN values for each variable
df_curfew1[df_curfew1 < 0].apply(pd.Series.value_counts)

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PUBID,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,EV_GANG,R_ARRESTED_EV,ARREST_TOT97,SEX,BIRTH_MO,BIRTH_YR,HARD_TIMES97,HH_INCOME97,CV_SAMPLE_TYPE,RACE_ETH
-4.0,,,,,2.0,,103.0,103.0,,,,,3368.0,,,,388.0,352.0,,
-3.0,,7.0,7.0,,,,,,,,,,,,,,,507.0,,
-2.0,,,,,65.0,,3.0,8.0,30.0,13.0,,,1.0,,,,4.0,,,
-1.0,,,,,,,3.0,3.0,1.0,1.0,1.0,5.0,,,,,3.0,,,


* INCAR_TOT - delete all < 0 (7 invalid skips)
* PERC_PEERS_GANG - delete -4 (valid skips); turn -2 (don't know) into 0's (0  - no or don't know)
* BR_CURF_L30 - drop -1 and -2 (refuse, don't know); -4 (valid skip) to 0 (can't break curfew if you don't have it)
* CURF_ENFORCM - drop -1 and -2; -4 to 0
* NEIGH_SCH_GANG, SIB_FR_GANG -  -2 to 0; drop -1 (refusal)
* EV_GANG, R_ARRESTED_EV drop -1
* ARREST_TOT97 - drop -2; -4 to 0 (didn't ask if answered "no" to "ever arrested?")
* HARD_TIMES97 - turn -4 and -2 to 0; -1 to drop 
* HH_INCOME97 - possible valid skips are b/c child not considered permanent member of HH; turn all negative to -1, drop all negative when using


In [134]:
# INCAR_TOT
df_curfew1 = df_curfew1[df_curfew1.INCAR_TOT >= 0]


# PERC_PEERS_GANG
# recoding negative (NaN) values
df_curfew1['PERC_PEERS_GANG'] = np.where(df_curfew1['PERC_PEERS_GANG'] == -2, 0, df_curfew1['PERC_PEERS_GANG'])
df_curfew1 = df_curfew1[df_curfew1.PERC_PEERS_GANG >= 0]

#recoding values for regression
df_curfew1.loc[df_curfew1.PERC_PEERS_GANG == 1, 'PERC_PEERS_GANG'] = 0.05
df_curfew1.loc[df_curfew1.PERC_PEERS_GANG == 2, 'PERC_PEERS_GANG'] = 0.25
df_curfew1.loc[df_curfew1.PERC_PEERS_GANG == 3, 'PERC_PEERS_GANG'] = 0.5
df_curfew1.loc[df_curfew1.PERC_PEERS_GANG == 4, 'PERC_PEERS_GANG'] = 0.75
df_curfew1.loc[df_curfew1.PERC_PEERS_GANG == 5, 'PERC_PEERS_GANG'] = 0.95


# BR_CURF_L30
df_curfew1['BR_CURF_L30'] = np.where(df_curfew1['BR_CURF_L30'] == -4, 0, df_curfew1['BR_CURF_L30'])
df_curfew1 = df_curfew1[df_curfew1.BR_CURF_L30 >= 0]

# CURF_ENFORCM 
df_curfew1['CURF_ENFORCM'] = np.where(df_curfew1['CURF_ENFORCM'] == -4, 0, df_curfew1['CURF_ENFORCM'])
df_curfew1 = df_curfew1[df_curfew1.CURF_ENFORCM >= 0]

# NEIGH_SCH_GANG
df_curfew1['NEIGH_SCH_GANG'] = np.where(df_curfew1['NEIGH_SCH_GANG'] == -2, 0, df_curfew1['NEIGH_SCH_GANG'])
df_curfew1 = df_curfew1[df_curfew1.NEIGH_SCH_GANG >= 0]

# SIB_FR_GANG
df_curfew1['SIB_FR_GANG'] = np.where(df_curfew1['SIB_FR_GANG'] == -2, 0, df_curfew1['SIB_FR_GANG'])
df_curfew1 = df_curfew1[df_curfew1.SIB_FR_GANG >= 0]

# EV_GANG
df_curfew1 = df_curfew1[df_curfew1.EV_GANG >= 0]

# R_ARRESTED_EV
df_curfew1 = df_curfew1[df_curfew1.R_ARRESTED_EV >= 0]

# ARREST_TOT97
df_curfew1['ARREST_TOT97'] = np.where(df_curfew1['ARREST_TOT97'] == -4, 0, df_curfew1['ARREST_TOT97'])
df_curfew1 = df_curfew1[df_curfew1.ARREST_TOT97 >= 0]

# HARD_TIMES97
df_curfew1['HARD_TIMES97'] = np.where(df_curfew1['HARD_TIMES97'] == -2 | -4, 0, df_curfew1['HARD_TIMES97'])
df_curfew1 = df_curfew1[df_curfew1.HARD_TIMES97 >= 0]

# HH_INCOME97 - drop negative R for this var when using in reg
df_curfew1['HH_INCOME97'] = np.where(df_curfew1['HH_INCOME97'] < 0, -1, df_curfew1['HH_INCOME97'])


In [154]:
print(dict_aq['CURF_ENFORCM'])
df_curfew1.CURF_ENFORCM.unique()

{'Q': 'WHAT WOULD PARENT(S) DO IF R BREAKS CURFEW?', 'ANS': '1 "DISCUSS IT CALMLY WITH YOU"  2 "IGNORE IT, PRETEND THAT IT DIDN\'T HAPPEN OR LET YOU GET AWAY WITH IT"  3 "SULK, POUT, OR GIVE YOU THE SILENT TREATMENT"  4 "TAKE AWAY A PRIVILEGE, GROUND YOU, OR GIVE YOU A CHORE"  5 "MAKE THREATS THAT WON\'T BE KEPT"  6 "YELL, SHOUT, OR SCREAM AT YOU"  7 "USE PHYSICAL PUNISHMENT"'}


array([1, 4, 6, 2, 3, 5, 0, 7], dtype=int64)

In [135]:
# create dummy variables for sex/race
df_curfew1.loc[df_curfew.RACE_ETH == 1, 'RACE_ETH'] = 'B'
df_curfew1.loc[df_curfew.RACE_ETH == 2, 'RACE_ETH'] = 'H'
df_curfew1.loc[df_curfew.RACE_ETH == 3, 'RACE_ETH'] = 'O'
df_curfew1.loc[df_curfew.RACE_ETH == 4, 'RACE_ETH'] = 'W'

df_curfew1.loc[df_curfew.SEX == 1, 'SEX'] = 'M'
df_curfew1.loc[df_curfew.SEX == 2, 'SEX'] = 'F'

# combine sex and race variables into one interacted variable
df_curfew1['SEX_RACE'] = df_curfew1['RACE_ETH'] + df_curfew1['SEX']

# create dummy variables and add to dataframe
race_sex_dummies = pd.get_dummies(df_curfew1.SEX_RACE)
df_curfew1 = pd.concat([df_curfew1, race_sex_dummies], axis=1)

# drop White Male to avoid dummy variable trap
df_curfew1.drop(['WM'], inplace=True, axis=1)

list(df_curfew1.columns)

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PUBID,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,...,CV_SAMPLE_TYPE,RACE_ETH,SEX_RACE,BF,BM,HF,HM,OF,OM,WF
2,0,0,0,3,1,1,2,1,1,0,...,1,H,HF,0,0,1,0,0,0,0
6,2,0,0,7,1,1,0,4,0,0,...,1,H,HM,0,0,0,1,0,0,0
9,0,0,0,10,1,3,0,1,1,1,...,1,W,WM,0,0,0,0,0,0,0
18,1,1,18,19,1,1,0,6,0,0,...,1,B,BM,0,1,0,0,0,0,0
22,0,0,0,23,4,1,0,4,0,0,...,1,H,HF,0,0,1,0,0,0,0


In [141]:
list(df_curfew1.columns)

['ARREST_TOT',
 'INCAR_TOT',
 'INCAR_TOT_MO',
 'PUBID',
 'PERC_PEERS_GANG',
 'WHO_SETS_CURF',
 'BR_CURF_L30',
 'CURF_ENFORCM',
 'NEIGH_SCH_GANG',
 'SIB_FR_GANG',
 'EV_GANG',
 'R_ARRESTED_EV',
 'ARREST_TOT97',
 'SEX',
 'BIRTH_MO',
 'BIRTH_YR',
 'HARD_TIMES97',
 'HH_INCOME97',
 'CV_SAMPLE_TYPE',
 'RACE_ETH',
 'SEX_RACE',
 'BF',
 'BM',
 'HF',
 'HM',
 'OF',
 'OM',
 'WF']

In [144]:
df_curfew1.drop(['SEX', 'RACE_ETH', 'SEX_RACE', 'PUBID', 'CV_SAMPLE_TYPE'], inplace=True, axis=1)

df_curfew1.head()

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,EV_GANG,...,BIRTH_YR,HARD_TIMES97,HH_INCOME97,BF,BM,HF,HM,OF,OM,WF
2,0,0,0,1,1,2,1,1,0,0,...,1983,0,63000,0,0,1,0,0,0,0
6,2,0,0,1,1,0,4,0,0,0,...,1983,0,-1,0,0,0,1,0,0,0
9,0,0,0,1,3,0,1,1,1,0,...,1984,0,-1,0,0,0,0,0,0,0
18,1,1,18,1,1,0,6,0,0,0,...,1984,0,0,0,1,0,0,0,0,0
22,0,0,0,4,1,0,4,0,0,0,...,1983,0,-1,0,0,1,0,0,0,0


## Exploratory data analysis

In [145]:
# descriptive stats with note about weighting
df_curfew1.describe()

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,EV_GANG,...,BIRTH_YR,HARD_TIMES97,HH_INCOME97,BF,BM,HF,HM,OF,OM,WF
count,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,...,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0,3091.0
mean,1.292138,0.175348,2.669039,1.432223,1.643158,1.41087,2.828211,0.411841,0.177289,0.036881,...,1983.494662,0.054351,38736.848916,0.126173,0.124232,0.098997,0.101262,0.004206,0.002912,0.25364
std,3.45808,0.636272,14.265336,0.889083,0.919301,4.011077,1.798037,0.492246,0.381975,0.188501,...,0.500052,0.226746,41204.7458,0.332098,0.329899,0.298706,0.301724,0.064726,0.05389,0.435164
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1983.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1983.0,0.0,8765.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,1.0,0.0,4.0,0.0,0.0,0.0,...,1983.0,0.0,30800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,2.0,3.0,1.0,4.0,1.0,0.0,0.0,...,1984.0,0.0,55484.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,62.0,7.0,207.0,5.0,3.0,30.0,7.0,1.0,1.0,1.0,...,1984.0,1.0,246474.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [146]:
df_curfew1.corr()

Unnamed: 0,ARREST_TOT,INCAR_TOT,INCAR_TOT_MO,PERC_PEERS_GANG,WHO_SETS_CURF,BR_CURF_L30,CURF_ENFORCM,NEIGH_SCH_GANG,SIB_FR_GANG,EV_GANG,...,BIRTH_YR,HARD_TIMES97,HH_INCOME97,BF,BM,HF,HM,OF,OM,WF
ARREST_TOT,1.0,0.606963,0.338203,0.076703,-0.008223,0.147759,0.00188,0.101163,0.137669,0.193473,...,-0.001812,0.032986,-0.093628,-0.077195,0.144341,-0.051818,0.060036,0.003184,0.00238,-0.098289
INCAR_TOT,0.606963,1.0,0.56104,0.050764,-0.023565,0.08614,0.014458,0.060738,0.091758,0.099864,...,0.022269,0.046078,-0.069579,-0.072574,0.14287,-0.052201,0.062568,0.01352,0.01342,-0.095226
INCAR_TOT_MO,0.338203,0.56104,1.0,0.077089,-0.024926,0.044887,0.015207,0.063061,0.074142,0.116226,...,-0.023453,0.012667,-0.060752,-0.06127,0.227555,-0.052079,0.022075,-0.002347,0.00378,-0.094387
PERC_PEERS_GANG,0.076703,0.050764,0.077089,1.0,-0.00129,0.083677,0.05618,0.363654,0.244087,0.173264,...,-0.050859,0.069649,-0.126939,0.088159,0.084988,0.074016,0.057562,0.013391,-0.006011,-0.111134
WHO_SETS_CURF,-0.008223,-0.023565,-0.024926,-0.00129,1.0,0.05329,-0.117567,-0.024129,0.024469,0.038621,...,-0.023505,-0.006289,0.066513,-0.082504,-0.065063,0.001407,-0.006193,0.008914,-0.024748,0.0621
BR_CURF_L30,0.147759,0.08614,0.044887,0.083677,0.05329,1.0,0.031015,0.100961,0.105579,0.107503,...,-0.001165,0.043046,-0.052747,-0.031399,0.038942,0.007367,0.060273,0.017026,-0.010028,-0.081046
CURF_ENFORCM,0.00188,0.014458,0.015207,0.05618,-0.117567,0.031015,1.0,0.080328,0.039176,0.022519,...,0.002039,0.005446,-0.019617,0.09647,-0.001109,-0.000863,-0.058597,0.034018,0.008504,0.02634
NEIGH_SCH_GANG,0.101163,0.060738,0.063061,0.363654,-0.024129,0.100961,0.080328,1.0,0.32928,0.150148,...,-0.057461,0.048742,-0.11474,0.048269,0.061486,0.083586,0.07211,-0.013752,0.00358,-0.088961
SIB_FR_GANG,0.137669,0.091758,0.074142,0.244087,0.024469,0.105579,0.039176,0.32928,1.0,0.219289,...,-0.047567,0.097954,-0.096832,0.055762,0.051161,0.095726,0.066012,0.009101,0.006358,-0.064238
EV_GANG,0.193473,0.099864,0.116226,0.173264,0.038621,0.107503,0.022519,0.150148,0.219289,1.0,...,-0.032244,0.013659,-0.029972,-0.012323,0.045992,-0.018885,0.042426,0.013807,0.021283,-0.039117


In [None]:
g = sns.jointplot(x='', y='ARREST_TOTNUM')

### Model

y = (test) arrest_totnum, incar_totnum, incarc_totmonths

x = sex/race dummies, who sets curfew limits dummies (OR parental enforcement method?!), # of times arrested, friends in gang? dummy, income last yr

In [None]:
df_curfew['YSAQ-248'].unique()

In [None]:
#new df
df_broke_curfew = df_curfew[df_curfew['YSAQ-248'] > 0]

df_broke_curfew.head()

In [None]:
#seaborn barplot

#ax = sns.barplot(x='YSAQ-248')


In [None]:
#matplotlib barplot



plt.bar(df_broke_curfew['YSAQ-248'], 2)

## Exploratory Data Analysis
