In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

### read data file

In [2]:
dataframe = pd.read_csv("TSAClaims2002_2017.csv",encoding='cp1252')

#### columns of dataframe

In [4]:
dataframe.columns

Index(['Claim_Number', 'Date_Received', 'Incident_Date', 'Airport_Code',
       'Airport_Name', 'Claim_Type', 'Claim_Site', 'Item_Category',
       'Close_Amount', 'Disposition', 'StateName', 'State', 'County', 'City'],
      dtype='object')

In [5]:
dataframe.shape

(220855, 14)

### display first 5 rows of dataframe

In [3]:
dataframe.head()

Unnamed: 0,Claim_Number,Date_Received,Incident_Date,Airport_Code,Airport_Name,Claim_Type,Claim_Site,Item_Category,Close_Amount,Disposition,StateName,State,County,City
0,2006081611123,17027.0,17006.0,,,Passenger Property Loss,Checked Baggage,Candles - Decorative and other; Clothing - Sho...,,,,,,
1,2006062108380,16972.0,16957.0,,,Passenger Property Loss,Checkpoint,Jewelry - Fine,,,,,,
2,2006062008258,16972.0,16938.0,,,Passenger Property Loss,Checked Baggage,"Cosmetics - Perfume, toilet articles, medicine...",,,,,,
3,2006010699056,16831.0,16793.0,,,Passenger Property Loss,Checked Baggage,Other,,,,,,
4,2006032303625,16880.0,16861.0,,,Property Damage,Checked Baggage,Luggage (all types including footlockers),,,,,,


# Frequency distribution for Claim_Type, Claim_Site, Disposition

In [9]:
dataframe['Claim_Type'].value_counts(sort=False,normalize=True)

Passenger Theft                            0.002249
Wrongful Death                             0.000019
Passenger Property Loss                    0.595430
Passenger Property Loss/Personal Injury    0.000061
Passenger Property Loss/Personal Injur     0.000038
Personal Injury                            0.007575
Motor Vehicle                              0.001986
Property Damage/Personal Injury            0.000066
Complaint                                  0.000343
Employee Loss (MPCECA)                     0.002310
Property Damage                            0.387904
Missed Flight                              0.000150
Not Provided                               0.000009
-                                          0.001761
Compliment                                 0.000014
Bus Terminal                               0.000005
Property Loss                              0.000080
Name: Claim_Type, dtype: float64

In [10]:
dataframe['Claim_Site'].value_counts(sort=False,normalize=True)

Pre-Check          0.000036
Bus Station        0.000091
Other              0.013229
Motor Vehicle      0.002453
Checked Baggage    0.779139
Not Provided       0.000005
-                  0.001758
Checkpoint         0.203289
Name: Claim_Site, dtype: float64

In [11]:
dataframe['Disposition'].value_counts(sort=False,normalize=True)

Pending Payment            0.000005
Closed: Canceled           0.000806
Closed:Canceled            0.001372
Received                   0.000068
*Insufficient              0.008379
Settle                     0.166233
Approve in Full            0.236421
In Review                  0.043164
Deny                       0.480994
losed: Contractor Claim    0.000353
Closed:Contractor Claim    0.000555
-                          0.061650
Name: Disposition, dtype: float64

# summary of Frequency distribution for Claim_Type, Claim_Site, Disposition

# Data cleaning.

In [15]:
"""correcting mistaken entery values of Disposition"""
new = []
for i in dataframe['Disposition']:
    if i == '-':
        i = 'Unknown'
    elif i == 'Closed: Canceled':
        i = 'Closed:Canceled'
    elif i == 'losed: Contractor Claim':
        i = 'Closed:Contractor Claim'
    new.append(i)

In [16]:
dataframe['Disposition'] = new

In [17]:
new = []
for i in dataframe['Claim_Site']:
    if i == '-':
        i = 'Unknown'
    new.append(i)

In [18]:
dataframe["Claim_Site"] = new

In [19]:
new = []
for i in dataframe['Claim_Type']:
    if i == '-':
        i = 'Unknown'
    new.append(i)

In [20]:
dataframe["Claim_Type"] = new

In [21]:
for i in dataframe.columns:
    if i in ['Date_Received','Incident_Date','Close_Amount']:
         dataframe[i] = dataframe[i].fillna(0)
    elif i in ['Airport_Code','Airport_Name','Claim_Type','Claim_Site','Item_Category','Disposition','StateName','State']:
        dataframe[i] = dataframe[i].fillna('Unknown')
        

In [38]:
new = []
for i in dataframe["Claim_Type"]:
    if i == 'Passenger Property Loss/Personal Injur':
        i = 'Passenger Property Loss/Personal Injury'
    elif i == 'Property Loss':
        i = 'Passenger Property Loss'
    new.append(i)

In [39]:
dataframe["Claim_Type"] = new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### droping duplicate records from dataframe cleaned one assigned to dataframe again.

In [26]:
dataframe = dataframe.drop_duplicates('Claim_Number')

In [27]:
dataframe.head()

Unnamed: 0,Claim_Number,Date_Received,Incident_Date,Airport_Code,Airport_Name,Claim_Type,Claim_Site,Item_Category,Close_Amount,Disposition,StateName,State,County,City
0,2006081611123,17027.0,17006.0,Unknown,Unknown,Passenger Property Loss,Checked Baggage,Candles - Decorative and other; Clothing - Sho...,0.0,Unknown,Unknown,Unknown,,
1,2006062108380,16972.0,16957.0,Unknown,Unknown,Passenger Property Loss,Checkpoint,Jewelry - Fine,0.0,Unknown,Unknown,Unknown,,
2,2006062008258,16972.0,16938.0,Unknown,Unknown,Passenger Property Loss,Checked Baggage,"Cosmetics - Perfume, toilet articles, medicine...",0.0,Unknown,Unknown,Unknown,,
3,2006010699056,16831.0,16793.0,Unknown,Unknown,Passenger Property Loss,Checked Baggage,Other,0.0,Unknown,Unknown,Unknown,,
4,2006032303625,16880.0,16861.0,Unknown,Unknown,Property Damage,Checked Baggage,Luggage (all types including footlockers),0.0,Unknown,Unknown,Unknown,,


## Frequency distribution for Claim_Type, Claim_Site, Disposition after data cleaing.

In [36]:
list1 = ['Claim_Type','Claim_Site','Disposition']
def value_counts():
    for i in list1:
        print(dataframe[i].value_counts())
        print("--------------------------------------------------")
        print("\n \n \n")

In [40]:
value_counts()

Passenger Property Loss                    126811
Property Damage                             82600
Unknown                                      8283
Personal Injury                              1597
Employee Loss (MPCECA)                        492
Passenger Theft                               479
Motor Vehicle                                 416
Complaint                                      73
Missed Flight                                  32
Passenger Property Loss/Personal Injury        21
Property Damage/Personal Injury                14
Wrongful Death                                  4
Compliment                                      3
Not Provided                                    2
Bus Terminal                                    1
Name: Claim_Type, dtype: int64
--------------------------------------------------

 
 

Checked Baggage    171497
Checkpoint          44737
Other                2912
Unknown              1121
Motor Vehicle         532
Bus Station            20
Pre-Ch