In [1]:
# First review of driving data from 340742548_T_NPTS_PERSON_1995.csv
# Import all needed libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the NPTS 1995 person file into the `driving_data` DataFrame.
# The path is relative to the current working directory, so the CSV must sit
# next to wherever the notebook kernel is started.

driving_data = pd.read_csv('./340742548_T_NPTS_PERSON_1995.csv')

In [15]:
# Show the dimensions of the loaded frame as (number of rows, number of columns)

driving_data.shape

(95360, 179)

In [7]:
# List the column labels. With this many columns the displayed Index is
# abbreviated with "..." in the middle; use list(driving_data.columns) to see all.

driving_data.columns

Index(['HOUSEID', 'PERSONID', 'PROXY', 'R_AGEFLG', 'REF_ROST', 'R_AGE',
       'R_SEX', 'R_RELAT', 'DRIVER', 'DTCONJ',
       ...
       'WTINDMAN', 'WTINDTRN', 'WTINDWHL', 'WTINDRET', 'WTINDFIN', 'WTINDSVC',
       'WORKLOC', 'YEARMIL2', 'YEAR', 'HHCMSA_NM'],
      dtype='object', length=179)

In [8]:
# Show the first 5 rows.
# The frame is left as the cell's last expression (no print()) so the notebook
# renders it as a table instead of a line-wrapped plain-text dump.

driving_data.head()

   HOUSEID  PERSONID  PROXY  R_AGEFLG  REF_ROST  R_AGE  R_SEX  R_RELAT  \
0  1000371         1      2         2         1     33      2        1   
1  1000389         1      2         2         1     52      1        1   
2  1000439         1      2         2         1     26      2        1   
3  1000439         3      1         2         1      7      2        3   
4  1000439         2      1         2         1      9      2        3   

   DRIVER  DTCONJ      ...        WTINDMAN  WTINDTRN  WTINDWHL  WTINDRET  \
0       1       3      ...              30         0         0        20   
1       1       3      ...              80         0         0        10   
2       1       3      ...             994       994       994       994   
3       2      94      ...             994       994       994       994   
4       2      94      ...             994       994       994       994   

   WTINDFIN  WTINDSVC  WORKLOC  YEARMIL2  YEAR      HHCMSA_NM  
0         0        30        3    

In [9]:
# Summary statistics for respondent age (R_AGE).
# This is computed over every row of the person file, not only drivers;
# filter on the DRIVER column first if driver-only ages are wanted.
# NOTE(review): confirm the DRIVER coding against the survey codebook.

driving_data['R_AGE'].describe()

count    95360.000000
mean        38.462060
std         20.626037
min          5.000000
25%         22.000000
50%         38.000000
75%         53.000000
max         88.000000
Name: R_AGE, dtype: float64

In [11]:
# Summary statistics for the miles driven in a year (YEARMILE).
# NOTE(review): values such as 999994 / 999998 / 999999 occur in this column and
# look like reserved survey codes rather than real mileage -- confirm against the
# codebook. While they are included, the mean, std and upper quartiles are not
# meaningful as mileage figures.

driving_data['YEARMILE'].describe()

count     95360.000000
mean     319399.831449
std      457238.852565
min           0.000000
25%        7000.000000
50%       15000.000000
75%      999994.000000
max      999999.000000
Name: YEARMILE, dtype: float64

In [13]:
# Show the 10 rows with the largest YEARMILE values (sorted in descending order).
# Only the top rows are displayed; returning the whole sorted frame as cell output
# adds nothing for inspection and bloats the saved notebook.
# NOTE(review): the largest values are 999999, which looks like a reserved code
# rather than a real mileage -- confirm against the codebook.

driving_data.sort_values(by='YEARMILE', ascending=False).head(10)

Unnamed: 0,HOUSEID,PERSONID,PROXY,R_AGEFLG,REF_ROST,R_AGE,R_SEX,R_RELAT,DRIVER,DTCONJ,...,WTINDMAN,WTINDTRN,WTINDWHL,WTINDRET,WTINDFIN,WTINDSVC,WORKLOC,YEARMIL2,YEAR,HHCMSA_NM
86213,11221470,2,1,2,1,72,2,2,1,94,...,994,994,994,994,994,994,94,999999,1995,Not in a CMSA
88015,11270824,3,1,2,1,16,2,3,1,94,...,994,994,994,994,994,994,94,999999,1995,Not in a CMSA
71173,10004307,1,2,2,1,33,2,1,1,3,...,998,998,998,998,998,998,3,999999,1995,"New York-North. Nj-Long Island, Ny-Nj-Ct"
86973,11242260,1,2,2,1,82,1,1,1,99,...,994,994,994,994,994,994,94,999999,1995,Not in a CMSA
58947,8084352,2,2,2,1,48,2,3,1,1,...,0,20,0,10,0,30,3,999999,1995,Not in a CMSA
27022,3209244,1,1,2,1,40,1,1,1,94,...,998,998,998,998,998,998,3,999999,1995,Not in a CMSA
1435,1044296,2,1,2,1,33,1,2,1,94,...,0,0,0,20,10,20,3,999999,1995,Not in a CMSA
87664,11262300,1,2,2,1,29,1,1,1,99,...,998,998,998,998,998,998,94,999999,1995,Not in a CMSA
22891,2426047,1,2,2,1,51,1,1,1,99,...,20,30,10,20,0,10,3,999999,1995,Not in a CMSA
27803,3231644,2,2,2,1,53,2,2,1,2,...,0,0,0,20,10,50,3,999999,1995,"New York-North. Nj-Long Island, Ny-Nj-Ct"


In [14]:
# Count how often each distinct YEARMILE value occurs, most frequent first.
# Useful for spotting heavily repeated round numbers and placeholder codes.

driving_data['YEARMILE'].value_counts()

999994    25194
10000      7395
15000      6174
12000      5364
0          4580
20000      4536
999998     4422
5000       3348
8000       2572
6000       2043
30000      2018
25000      1991
3000       1832
1000       1712
2000       1712
7000       1583
4000       1323
18000      1090
9000        890
500         819
14000       813
13000       774
40000       731
11000       624
16000       618
35000       594
1500        552
50000       517
100         453
200         440
          ...  
2280          1
69000         1
3350          1
3625          1
114400        1
12050         1
16400         1
24            1
4250          1
470           1
60980         1
121000        1
41600         1
275           1
14740         1
23200         1
7120          1
137000        1
18210         1
13100         1
980           1
298           1
9360          1
14893         1
11450         1
21            1
85            1
490           1
3668          1
31600         1
Name: YEARMILE, dtype: i