In [None]:
import pandas as pd

# Load the raw HVAC supply extract.
# Author's note on the raw file: 35194 rows x 23 columns, 14330 unique job_seq.
raw_supply_csv = r'RawData/cvn-hvac-supply-data.csv'
data = pd.read_csv(raw_supply_csv)

# Variables Already Present in Jobs Dataset
<pre>
job_seq: Will be used for joining purposes
jcn
uic
work_center
date_maintenance_action
date_closing
issue_apl
issue_eic

</pre>

# Variables Not Present in Jobs Dataset
<pre>
demand_date
issue_date
cwt_hours
niin
niin_nomenclature
unit_of_issue
supply_pri
source_code
quantity
unit_price
total_price
</pre>

# Removing Unnecessary Columns

In [None]:
# Remove document/requisition identifier columns.
# Author's note: the dimension is 35194 x 20 after this step.
#
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone, and a plain drop would raise KeyError.
data = data.drop(
    columns=['document_number', 'request_num', 'requisition_number'],
    errors='ignore',
)

# NA Handling

In [3]:
# Count missing values per column and print them as a two-column text table.
na_counts = data.isna().sum()

# Header row, separator row, then one row per column of `data`.
report_lines = [
    f"{'Column Name':<40} | {'# of NAs'}",
    "-" * 40 + " | " + "-" * 8,
]
report_lines.extend(f"{col:<40} | {count}" for col, count in na_counts.items())

print("\n".join(report_lines))

Column Name                              | # of NAs
---------------------------------------- | --------
jcn                                      | 0
uic                                      | 0
work_center                              | 0
jsn                                      | 0
date_maintenance_action                  | 0
date_closing                             | 849
document_number                          | 2646
request_num                              | 2646
requisition_number                       | 24708
demand_date                              | 571
issue_date                               | 0
cwt_hours                                | 571
issue_apl                                | 0
issue_eic                                | 2
niin                                     | 631
niin_nomenclature                        | 612
unit_of_issue                            | 0
supply_pri                               | 27
source_code                              | 2330
quantity         

In [None]:
# date_closing NA handling
#
# 849 NAs. Per the author's note, an NA represents a job that was still open
# when the data was collected, so the missing values are kept rather than imputed.

# errors='coerce' converts anything that cannot be parsed (including the NAs) to NaT.
# Note that a malformed date string would also become NaT without raising.
data['date_closing'] = pd.to_datetime(data['date_closing'], errors='coerce')


In [None]:
# demand_date NA handling
#
# Author's counts: 571 NAs across the 35194 rows; among the 14330 unique job_seq
# only 265 have an NA demand date.

# Parse to datetime; errors='coerce' leaves the NAs as NaT.
demand_date_parsed = pd.to_datetime(data['demand_date'], errors='coerce')
data['demand_date'] = demand_date_parsed

In [None]:
# issue_eic NA handling
#
# 2 NAs, filled with the column's mode (hard-coded from the author's earlier check).
issue_eic_fill = 'T200000'
data['issue_eic'] = data['issue_eic'].fillna(issue_eic_fill)

In [10]:
# supply_pri NA handling
#
# 27 NAs, filled with the column's mode (hard-coded from the author's earlier check).
# The column's data type will be converted later.
supply_pri_fill = 4.0
data['supply_pri'] = data['supply_pri'].fillna(supply_pri_fill)

In [12]:
# source_code NA handling
#
# 2330 NAs. They are mapped to a new category 'X' that stands for "missing".
# NOTE(review): assumes 'X' is not already a real source_code value; confirm.
source_code_missing = 'X'
data['source_code'] = data['source_code'].fillna(source_code_missing)

In [None]:
#niin and niin_nomenclature NA Handling

#631 NAs in niin, 612 NAs in niin_nomenclature



# Converting Data Types

In [13]:
# Parse issue_date to datetime. No errors='coerce' here, so an unparseable value
# raises instead of silently becoming NaT (the NA table shows 0 NAs for this column).
issue_date_parsed = pd.to_datetime(data['issue_date'])
data['issue_date'] = issue_date_parsed

# Data Types Before and After Modifications

# Supply Wait Time (Issue Vs. Demand Date) Modifications

In [None]:
# Flag rows where the issue date comes before the demand date.
#
# Author's check: (data['cwt_hours'] < 0).mean() shows 6.3% of cwt_hours are negative,
# likely due to a data entry error.
#
# Keeping this as a flag preserves the information; the rows can be excluded
# with a subset later if needed.
# Comparisons against NaT evaluate to False, so rows with a missing demand_date
# are flagged False here.
data['issue_before_demand'] = data['issue_date'].lt(data['demand_date'])

In [None]:
# supply_days_waiting: number of days between issue_date and demand_date.
#
# The absolute value assumes that, when issue_date precedes demand_date,
# the two dates were entered in the wrong order.
wait_duration = (data['issue_date'] - data['demand_date']).abs()

# Keep only the whole-day component of each timedelta.
data['supply_days_waiting'] = wait_duration.dt.days


In [None]:
# Outlier handling for supply_days_waiting
#
# Adds a boolean flag marking rows whose supply_days_waiting lies above the
# upper IQR fence (Q3 + 1.5 * IQR). Only the upper fence is used for the flag.

days_waiting_q1 = data['supply_days_waiting'].quantile(0.25)
days_waiting_q3 = data['supply_days_waiting'].quantile(0.75)
days_waiting_IQR = days_waiting_q3 - days_waiting_q1

days_waiting_upper_fence = days_waiting_q3 + 1.5 * days_waiting_IQR

# Flag variable telling whether supply_days_waiting is an outlier.
data['supply_days_waiting_outlier'] = data['supply_days_waiting'].gt(days_waiting_upper_fence)

# Two-sided variant kept for reference (author's note: 3.7k outliers):
#days_waiting_outliers = data[(data['supply_days_waiting'] < days_waiting_q1 - 1.5 * days_waiting_IQR) | (data['supply_days_waiting'] > days_waiting_q3 + 1.5 * days_waiting_IQR)]

# Notes on Dates For Unique Job Sequences

<pre>
Are days waiting the same for all parts associated with a job_seq or differ?

issue_date and demand_date can differ. There could be up to 20 unique issue_date and demand_date for a given job_seq

date_maintenance_action and date_closing never differ for job_seq
</pre>

In [None]:
# """
# Cell generated by Data Wrangler. #Shows distinct counts of dates for each job_seq
# """
# def clean_data(data):
#     # Performed 4 aggregations grouped on column: 'job_seq'
#     data = data.groupby(['job_seq']).agg(date_maintenance_action_nunique=('date_maintenance_action', 'nunique'), date_closing_nunique=('date_closing', 'nunique'), issue_date_nunique=('issue_date', 'nunique'), demand_date_nunique=('demand_date', 'nunique')).reset_index()
#     return data

# data_clean = clean_data(data.copy())
# data_clean.head()

# Next Steps

<pre>
niin and niin_nomenclature NA Handling
Convert Data Types
Make table showing previous and current data types for all columns
Save cleaned pkl file

Make sure there are no more NAs, outliers have been detected/handled, data types have been appropriately set,
useless columns have been removed, and supply_days_waiting is accurate. Also consider how unique job_seq values behave beyond the date checks done above.

After cleaning, can move to supply EDA. 
Then, merge. Ideally, want each record to uniquely identify a maintenance action.
Many rows will not have supply data, as there are 47k job records and 14k supply records.


</pre>