In [1]:
#Import the essential Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset from a CSV file, resolved relative to the current working directory
df = pd.read_csv('./homeless.csv')
# Preview the first five rows to sanity-check the load
df.head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570,864,4887681
1,1,Pacific,Alaska,1434,582,735139
2,2,Mountain,Arizona,7259,2606,7158024
3,3,West South Central,Arkansas,2280,432,3009733
4,4,Pacific,California,109008,20964,39461588


In [3]:
# Report the frame's dimensions as a (rows, columns) tuple
df.shape

(51, 6)

In [4]:
# List the column labels present in the loaded frame
df.columns

Index(['Unnamed: 0', 'region', 'state', 'individuals', 'family_members',
       'state_pop'],
      dtype='object')

In [5]:
# Show the dtype pandas assigned to each column
df.dtypes

Unnamed: 0         int64
region            object
state             object
individuals        int64
family_members     int64
state_pop          int64
dtype: object

# Drop the Unnecessary Column 

In [6]:
# Remove the "Unnamed: 0" column; the result gets its own name so `df` stays untouched
df2 = df.drop(columns=["Unnamed: 0"])
df2.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570,864,4887681
1,Pacific,Alaska,1434,582,735139
2,Mountain,Arizona,7259,2606,7158024
3,West South Central,Arkansas,2280,432,3009733
4,Pacific,California,109008,20964,39461588


# Check the Quality of the Dataset

In [7]:
# Count the missing values in each column
df2.isna().sum()

region            0
state             0
individuals       0
family_members    0
state_pop         0
dtype: int64

In [8]:
# True if at least one row is an exact duplicate of an earlier row
df2.duplicated().any()

False

In [19]:
# Count the distinct values in each column
df2.nunique()

region             9
state             51
individuals       49
family_members    50
state_pop         51
dtype: int64

# Descriptive Statistics

In [10]:
# Summary statistics of the numeric columns, transposed so each column becomes a row
df2.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
individuals,51.0,7225.784,15991.03,434.0,1446.5,3082.0,6781.5,109008.0
family_members,51.0,3504.882,7805.412,75.0,592.0,1482.0,3196.0,52070.0
state_pop,51.0,6405637.0,7327258.0,577601.0,1777413.5,4461153.0,7340946.5,39461588.0


# Insights

1. Which region has the highest number of homeless people in the US?

In [11]:
# Discard "state" and "state_pop", leaving the region and the two homeless counts
total_region = df2.drop(columns=['state', 'state_pop'])
total_region.head()

Unnamed: 0,region,individuals,family_members
0,East South Central,2570,864
1,Pacific,1434,582
2,Mountain,7259,2606
3,West South Central,2280,432
4,Pacific,109008,20964


In [12]:
# Add a 'Total_Homeless' column: individuals plus family members, row by row
total_region['Total_Homeless'] = total_region['individuals'].add(total_region['family_members'])
# Sum every count per region and list the regions from most to fewest homeless people
(
    total_region
    .groupby('region')
    .sum()
    .sort_values('Total_Homeless', ascending=False)
)

Unnamed: 0_level_0,individuals,family_members,Total_Homeless
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pacific,142136,33162,175298
Mid-Atlantic,54038,60769,114807
South Atlantic,52260,23818,76078
East North Central,25406,14002,39408
Mountain,28491,9258,37749
West South Central,26842,8110,34952
New England,12903,17499,30402
West North Central,13971,8242,22213
East South Central,12468,3889,16357


2. Which region has the lowest number of homeless people per thousand residents in the US?

In [13]:
# Drop only "state" ("state_pop" is kept), then append a 'Total_Homeless' column
# holding individuals plus family members for each row
df6 = (
    df2
    .drop(columns=['state'])
    .assign(Total_Homeless=lambda frame: frame['individuals'] + frame['family_members'])
)
df6.head(10)

Unnamed: 0,region,individuals,family_members,state_pop,Total_Homeless
0,East South Central,2570,864,4887681,3434
1,Pacific,1434,582,735139,2016
2,Mountain,7259,2606,7158024,9865
3,West South Central,2280,432,3009733,2712
4,Pacific,109008,20964,39461588,129972
5,Mountain,7607,3250,5691287,10857
6,New England,2280,1696,3571520,3976
7,South Atlantic,708,374,965479,1082
8,South Atlantic,3770,3134,701547,6904
9,South Atlantic,21443,9587,21244317,31030


In [14]:
# Keep the region, population and total count; sum both per region and order
# the regions by total homeless count, smallest first
percent = df6.drop(columns=['individuals', 'family_members'])
percent_reg = percent.groupby('region').sum().sort_values('Total_Homeless')
# Homeless people per 1,000 residents of each region
percent_reg['Total_Homeless(per 1000)'] = (
    percent_reg['Total_Homeless'].div(percent_reg['state_pop']).mul(1000)
)
# Display the regions ranked by the per-thousand rate, lowest first
percent_reg.sort_values('Total_Homeless(per 1000)')

Unnamed: 0_level_0,state_pop,Total_Homeless,Total_Homeless(per 1000)
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East North Central,46886387,39408,0.8405
East South Central,19101485,16357,0.856321
West South Central,40238324,34952,0.868625
West North Central,21350241,22213,1.04041
South Atlantic,65229624,76078,1.166311
Mountain,24511745,37749,1.540037
New England,14829322,30402,2.050127
Mid-Atlantic,41217298,114807,2.785408
Pacific,53323075,175298,3.28747


While East South Central (16357 total homeless) has the lowest number of homeless people, East North Central (0.84 per thousand) has the lowest number of homeless people per thousand residents.

3. Which state has the highest number of homeless people who are not part of a family with children (per thousand)?

In [15]:
# Work at state level: the region label is not needed here
percent_st = df2.drop(columns=['region'])
# Homeless individuals per 1,000 residents of each state
percent_st['individuals(per 1000)'] = (
    percent_st['individuals'].div(percent_st['state_pop']).mul(1000)
)
# Five highest rates
percent_st.sort_values('individuals(per 1000)', ascending=False).head()

Unnamed: 0,state,individuals,family_members,state_pop,individuals(per 1000)
8,District of Columbia,3770,3134,701547,5.373838
11,Hawaii,4131,2399,1420593,2.907941
4,California,109008,20964,39461588,2.762382
37,Oregon,11139,3337,4181886,2.663631
28,Nevada,7058,486,3027341,2.331419


4. Which state has the lowest number of homeless people who are part of a family with children (per thousand)?

In [16]:
# Homeless family members per 1,000 residents of each state, added to the existing state-level frame
percent_st['family_members(per 1000)'] = (
    percent_st['family_members'].div(percent_st['state_pop']).mul(1000)
)
# Five lowest rates
percent_st.sort_values('family_members(per 1000)').head()

Unnamed: 0,state,individuals,family_members,state_pop,individuals(per 1000),family_members(per 1000)
34,North Dakota,467,75,758080,0.61603,0.098934
24,Mississippi,1024,328,2981020,0.343507,0.110029
18,Louisiana,2540,519,4659690,0.545101,0.111381
48,West Virginia,1021,222,1804291,0.565873,0.12304
3,Arkansas,2280,432,3009733,0.757542,0.143534
