In [190]:
import pandas as pd
import csv
import os

In [191]:
#Activate the first option if whole column width is necessary and the second to make all columns visible
#pd.set_option('display.max_colwidth', -1)
#pd.set_option('display.max_columns', 999)

## I. Import the dataset and filter the relevant columns

In [192]:
#Get the report and store it into a dataframe. Check what columns are included in the file
report = pd.read_csv('1_PMNet_IWPReport.csv', dtype = str)
report.columns

Index(['Data Source', 'Snapshot Date', 'Area', 'IWP Number', 'ProjectName',
       '***Year***', 'Project Stage', 'Stage1aStart', 'Stage1aFinish',
       'Stage1bFinish', 'Stage1cFinish', 'Stage2a Finish', 'Stage2b Finish',
       'Stage2c Finish', 'Stage3a Finish', 'Stage3b Finish', 'Stage3c Finish',
       'Stage4a Finish', 'Stage4b Start', 'Stage4b Finish', 'Stage4c Finish',
       'Stage5a Finish', 'Stage5b Finish', 'Stage6 Finish',
       'Latest Works Cost', 'Budget Code', 'Budget Head', 'Delivery Type',
       'District', 'Town/Parish', 'Client Commissioning',
       '***Assistant team leader***', 'Group sponsor', 'Project Manager',
       'Team Leader', '***Assistant PM***', 'HertsHighways Comments', 'LTPF4',
       'WBSNo', 'Hermis Order No', 'Order Value', 'Risk',
       'Work Days To 4bStart', 'Work Days To 4bFinish',
       'Work Days Works Duration', 'Calendar Days To 4bStart',
       'Calendar Days To 4bFinish', 'Calendar Days Works Duration',
       'CCDivision', 'IWPCod

In [193]:
#Drop the irrelevant attributes, validated by an SME, and remove whitespaces
report.drop(columns = ['Data Source', 'Snapshot Date','Area', 
        'Budget Head', 'HertsHighways Comments', 'LTPF4',
       'WBSNo', 'Hermis Order No', 'Order Value', 'Risk',
       'Work Days To 4bStart', 'Work Days To 4bFinish',
       'Work Days Works Duration', 'Calendar Days To 4bStart',
       'Calendar Days To 4bFinish', 'Calendar Days Works Duration',
       'IWPCodeDescription', 'TrafficMngmntNRSWA'], inplace = True ) 

In [194]:
#Remove all whitespaces from column names
report.columns = [x.replace(" ", "") for x in report.columns]

In [195]:
#Validate with the SME that all the remaining columns are relevant
report.columns

Index(['IWPNumber', 'ProjectName', '***Year***', 'ProjectStage',
       'Stage1aStart', 'Stage1aFinish', 'Stage1bFinish', 'Stage1cFinish',
       'Stage2aFinish', 'Stage2bFinish', 'Stage2cFinish', 'Stage3aFinish',
       'Stage3bFinish', 'Stage3cFinish', 'Stage4aFinish', 'Stage4bStart',
       'Stage4bFinish', 'Stage4cFinish', 'Stage5aFinish', 'Stage5bFinish',
       'Stage6Finish', 'LatestWorksCost', 'BudgetCode', 'DeliveryType',
       'District', 'Town/Parish', 'ClientCommissioning',
       '***Assistantteamleader***', 'Groupsponsor', 'ProjectManager',
       'TeamLeader', '***AssistantPM***', 'CCDivision', 'IWPCode', 'UnitIDs',
       'USRNs'],
      dtype='object')

## II. Identify the key feature and format it to avoid empty cells and multiple values per row

In [196]:
#Drop all rows that are empty and drop all duplicate rows
report.dropna(axis=0, how='all', inplace=True)
report.drop_duplicates(inplace=True)

In [197]:
#Identify the key feature and split the cell if there are several values within one row
s = report['UnitIDs'].str.split(';').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'UnitIDs'
del report['UnitIDs']
report_mod = report.join(s)
report_mod = report_mod.reset_index(drop=True)
report_mod.shape #Shape after the filtering

(65694, 36)

In [198]:
#Count the number of cells that are null within the key feature
report_mod['UnitIDs'].isnull().sum()

1934

In [199]:
#Delete the null rows for the key feature and test it. The result of this sum operation must be 0.
report_mod.dropna(axis=0, how='any', subset = ['UnitIDs'], inplace = True)
report_mod['UnitIDs'].isnull().sum()

0

In [200]:
report_mod.shape

(63760, 36)

## 3. Filter the possible values for the relevant attributes

#### Budget code

In [201]:
#Check the possible values for the relevant features and their count and sort them:
report_mod.groupby(['BudgetCode'],as_index = False).size().sort_values(ascending = False)

BudgetCode
CWY    30330
MEM     5463
ARP     4978
ITP     4818
LGH     4234
FWY     2639
NCM     2097
SAR     2022
DRN     1619
LMZ     1318
WRC      825
CSC      728
DIS      701
BRG      666
TIS      478
NFM      264
SAM      146
LRZ      107
COM       93
FSC       83
MAJ       60
DSC       40
EXW       21
GBS       16
GSC       12
CZZ        2
dtype: int64

In [202]:
#Drop the instances that are out of scope, categorized by feature
filter_list = ['BRG','FWY','COM','CZZ','EXW','FSC','GBS','GSC','ITP','LGH','LMZ','LRZ','MAJ','NFM','SAM','TIS']
report_mod = report_mod[~report_mod.BudgetCode.isin(filter_list)]
report_mod = report_mod.reset_index(drop=True)

In [203]:
#Check the remaining categories
report_mod.groupby(['BudgetCode'],as_index = False).size().sort_values(ascending = False)

BudgetCode
CWY    30330
MEM     5463
ARP     4978
NCM     2097
SAR     2022
DRN     1619
WRC      825
CSC      728
DIS      701
DSC       40
dtype: int64

In [204]:
report.shape

(44151, 35)

#### Stage Date

In [205]:
#Discard works that took place before 2007
#report_mod['***Year***'].fillna(value = '0', inplace = True)
#report_mod['***Year***'] = report_mod['***Year***'].astype(str).astype(int)
#report_mod.drop(report_mod[(report_mod['***Year***']<2007)].index, inplace = True)

In [206]:
#Evaluate how many rows are included that don't have a date at all
missing_date = ['Stage1aStart', 'Stage1aFinish',
       'Stage1bFinish', 'Stage1cFinish', 'Stage2aFinish', 'Stage2bFinish',
       'Stage2cFinish', 'Stage3aFinish', 'Stage3bFinish', 'Stage3cFinish',
       'Stage4aFinish','Stage4bStart', 'Stage4bFinish', 'Stage4cFinish',
       'Stage5aFinish', 'Stage5bFinish', 'Stage6Finish']
#dates = report_mod.filter(missing_date) #Just to filter a number of columns
report_mod[missing_date].isnull().apply(lambda x: all(x), axis=1).sum()

21773

In [207]:
#Drop instances that have no date associated to the project
report_mod.dropna(subset=missing_date, how='all', inplace=True)
report_mod.shape

(27030, 36)

In [208]:
#Check the current stage for the remaining instances:
report_mod.groupby(report_mod['ProjectStage'], as_index = False).size().sort_values(ascending = False)

ProjectStage
Z     13349
6      6443
4a     1917
9      1882
5      1454
4c      812
X       606
2a      193
H       148
2c       98
2b       48
4b       24
1a       23
1b       20
3a        2
dtype: int64

In [209]:
#Drop instances that are in stages up to 4a and that were cancelled (X)
filter_stage = ['4a','2a','2c','X','2b','1a','1b']
report_mod = report_mod[~report_mod.ProjectStage.isin(filter_stage)]
report_mod = report_mod.reset_index(drop=True)
report_mod.shape

(24125, 36)

## Fill the blanks for rest of the categories with "Undefined"

In [210]:
report_mod['LatestWorksCost'].fillna(value = '0', inplace = True)
report_mod.fillna(value = 'Undefined', inplace = True)

In [211]:
#Use this to filter a specific value within a column
#report_mod.loc[report_mod['BudgetCode'] == 'ITP'].head(5)

### Save to a new CSV file 

In [212]:
report_mod.to_csv(r'C:\Users\J FernandezGomez\Jupyter Notebooks\Cleaning_Data_Notebooks\1_PMNET_IWReport\PMNET_processed_v2.csv')

Questions:




*Finish with the rest of the attributes


Project Stage: X = Cancelled (Even if they have a date, it was a projection, H = On hold, Z = Closed or Complete, 9 = 

What do we do with NaN dates? Check if it corresponds to a similar IWP number (just a parent or pre-patching) and combine the dates. If none are to be found, we discard them.

Check the codes on the DB_Mapping and update the script

IWPCode: Wait for the table.

    