# Data Manipulation and Cleaning with Python

In [None]:
# Import the necessary packages
import pandas as pd
import numpy as np
from datetime import datetime
import missingno as msno

## SECTION 1: Reading and accessing data

### Read the survey response data

To load data from a CSV file see the code below. The text file is in the same folder as this notebook.

In [None]:
# Load the survey responses; the relative path means the CSV must sit in the notebook's folder
data = pd.read_csv('class_survey_data.csv')
# Check the first few entries of the dataframe
data.head(5)

For information on how to save data to a text file, see here: https://towardsdatascience.com/how-to-export-pandas-dataframe-to-csv-2038e43d9c03 and check the documentation for the various settings you can change for the save process: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

To count the number of rows and columns:

In [None]:
# data.shape is (rows, columns); report each dimension on its own line
print(f"Number of rows: {data.shape[0]}")
print(f"Number of columns: {data.shape[1]}")

You can also get the dimensions directly by using:

In [None]:
# Number of rows and columns as a (rows, columns) tuple
data.shape

### Rename the columns

In [None]:
# Show all column names (at this point they are still the raw survey questions)
data.columns

Let's rename the columns as follows:

In [None]:
# Swap the verbose survey questions for short upper-case column names.
# Keys that are not present in the dataframe are simply ignored by rename().
data = data.rename(
    columns={
        'Timestamp': 'TIMESTAMP',
        'Group Name (use DNA if none)': 'GROUP_NAME',
        'What main industry have you worked in?': 'BACKGROUND_INDUSTRY',
        'How many years professional experience do you have?': 'BACKGROUND_YEARS_PROFESSIONAL',
        'How many years programming experience do you have?': 'BACKGROUND_YEARS_PROGRAMMING',
        'What key experiences do you have?': 'BACKGROUND_SKILLS',
        'Data management': 'IMPORT_DATA_MANAGEMENT',
        'Statistics': 'IMPORT_STATISTICS',
        'Visualisation': 'IMPORT_VISUALISATION',
        'Machine Learning & Data Mining': 'IMPORT_MACHINE_LEARNING',
        'Software Engineering': 'IMPORT_SOFTWARE_ENGINEERING',
        'Communication': 'IMPORT_COMMUNICATION',
        'How would you define Data Science in one sentence?': 'GOALS_DEFINITION',
        'What key skills do you want to develop?': 'GOALS_SKILLS',
        'What kind of role would you like to go into?': 'GOALS_ROLE',
        'What industry would you like to go into?': 'GOALS_INDUSTRY',
    }
)

In [None]:
# Check the data: the first rows should now show the new column names
data.head()

### Access a single Column

To access a single column use square brackets with the column name.

In [None]:
# Square brackets with a single column name select that one column
data["GROUP_NAME"]

In [None]:
# Check the unique values that appear in this column and how often each one occurs
data["GROUP_NAME"].value_counts()

### Drop a Column

To delete a column or a row, use drop() as described here https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html 

In [None]:
# Remove the GROUP_NAME column.
# errors='ignore' makes the cell safe to re-run: once the column is gone a plain
# drop() would raise KeyError.
data = data.drop(columns=['GROUP_NAME'], errors='ignore')

In [None]:
# Check the columns: GROUP_NAME should no longer be listed
data.columns

### Access a single Row

To extract a single row, you can use `.loc[]` or `.iloc[]`, and supply a row label or row position. An overview of the differences between these two approaches is here https://towardsdatascience.com/how-to-use-loc-and-iloc-for-selecting-data-in-pandas-bd09cb4c3d79<br>
As a brief summary the format for `.loc[]` is:<br>
    `.loc[row_label/s, column_label/s]` <br>
If you want to select all rows or all columns, use a colon `:` for the rows or columns. Take note of the location of the `:` inside `.loc[]` in the remaining examples below.

In [None]:
# Row with index label 20, all columns (the ':' stands for "every column")
data.loc[20, :]

### Access a Single Cell

The first method is to access via the index name and column name using `.loc`

In [None]:
# Single cell addressed by row label and column name
data.loc[20, "BACKGROUND_INDUSTRY"]

The second method is to access via the row and column indexes using `.iloc`

In [None]:
# Single cell addressed by row position 20 and column position 1 (both zero-based).
# NOTE(review): presumably position 1 is BACKGROUND_INDUSTRY once GROUP_NAME has been
# dropped - verify against data.columns.
data.iloc[20, 1]

You can also intermix index and column name as follows:

In [None]:
# data.index[20] turns row position 20 into its label, so .loc can combine it with a column name
data.loc[data.index[20], 'BACKGROUND_INDUSTRY']

### Selection Test

You can use basic evaluations when extracting data from a `dataframe`, using the comparison operators such as `>` `<=` etc

In [None]:
# Display data for survey respondents who gave a score of 2 or less for the importance of statistics.
# The comparison produces a True/False value per row; .loc keeps only the rows marked True.
data.loc[data["IMPORT_STATISTICS"] <= 2, :]

### Slicing a Dataframe

You can also slice `dataframes` using the same approach as with strings, except that here you slice whole rows (or columns) out of the `dataframe`.

In [None]:
# SLICE ROWS - show data between the 4th and 6th respondents
# (positions 3, 4 and 5: the end of an iloc slice is exclusive)
print(data.iloc[3:6, :])     # slice the dataframe by rows

In [None]:
# SLICE COLUMNS - all rows, every column from GOALS_DEFINITION through GOALS_INDUSTRY
data.loc[:, "GOALS_DEFINITION":"GOALS_INDUSTRY"]     # slice the dataframe by column labels

### Sort All Rows by One Column (Values)

In [None]:
# Sort the rows by the values of one column; the result is displayed only, `data` is not reassigned
data.sort_values("IMPORT_STATISTICS", axis = 0) # sort by the 'IMPORT_STATISTICS' column

### EXERCISE: What is the third respondent's rating for communication as integer value?

In [None]:
# TODO: replace the content of this cell with your Python solution
# NOTE: running this cell unchanged raises NotImplementedError
raise NotImplementedError

## SECTION 2: Cleaning the data

### Formatting the column values

The `value_counts()` method in pandas (like `Counter` from the `collections` module) is useful for quickly calculating frequencies.

In [None]:
# Check the data type of all columns
data.dtypes

In [None]:
# Print the (up to) 10 most frequent values of every column
for column in data.columns:
    print(column)
    # .head(10) is always positional. A [:10] slice is interpreted as a label slice
    # when the value_counts() index is float (e.g. a numeric column containing NaN).
    print(data[column].value_counts().head(10))
    print('\n')

### TIMESTAMP

In [None]:
# Convert the TIMESTAMP column from text to real datetime values.
# The raw column mixes US (month first) and UK (day first) layouts, each with or without seconds:
#   US: '%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M'
#   UK: '%d/%m/%Y %H:%M:%S', '%d/%m/%Y %H:%M'
# An ambiguous string such as '01/02/2023 10:00' parses under both layouts, so the layout is
# chosen by row position and every value is parsed exactly once, with the first format that fits.

# Candidate formats in priority order (US rows fall back to the UK layouts if the US ones fail)
US_formats = ['%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M', '%d/%m/%Y %H:%M:%S', '%d/%m/%Y %H:%M']
UK_formats = ['%d/%m/%Y %H:%M:%S', '%d/%m/%Y %H:%M']
# Display format, e.g. data['TIMESTAMP'].dt.strftime(format_to_use)
format_to_use = '%d/%m/%Y %H:%M:%S'
# The first 106 rows are in US format, the rest in UK format
N_US_ROWS = 106


def parse_timestamp(value, formats):
    '''Parse one raw timestamp.

    Inputs:
    value: the raw cell value (normally a string)
    formats: strptime format strings to try, in priority order

    Output:
    a datetime built with the first format that matches, or pd.NaT when the value is
    missing or matches none of the formats
    '''
    if isinstance(value, datetime):
        # Already converted (the cell has been re-run): keep the value as it is
        return value
    for fmt in formats:
        try:
            return datetime.strptime(value, fmt)
        except (TypeError, ValueError):
            # TypeError: not a string (e.g. NaN); ValueError: layout does not match
            continue
    return pd.NaT


parsed_timestamps = [
    parse_timestamp(value, US_formats if position < N_US_ROWS else UK_formats)
    for position, value in enumerate(data['TIMESTAMP'])
]
data['TIMESTAMP'] = pd.to_datetime(parsed_timestamps)

# Check the TIMESTAMP column
data['TIMESTAMP'].head(10)

### BACKGROUND_INDUSTRY

In [None]:
# Normalise the two industry columns:
#   1. correct the misspelt category 'ScientiIc Research' to 'Scientific Research'
#   2. encode any value outside the known categories (missing values included) as 'Other'
allowed_values = ['Education and Training', 'Financial and Insurance Services', 'Information Technology',
                  'Manufacturing', 'Media and Communications', 'Mining', 'Retail', 'ScientiIc Research',
                  'Utilities']
# The corrected spelling is accepted as well, so re-running the cell does not turn
# 'Scientific Research' into 'Other'
accepted_values = allowed_values + ['Scientific Research']

for column in ['BACKGROUND_INDUSTRY', 'GOALS_INDUSTRY']:
    corrected = data[column].replace('ScientiIc Research', 'Scientific Research')
    data[column] = corrected.where(corrected.isin(accepted_values), 'Other')

# Check the values in the BACKGROUND_INDUSTRY column
print('BACKGROUND_INDUSTRY')
print('-------------------')
print(data['BACKGROUND_INDUSTRY'].value_counts())
print('\n')

# Check the values in the GOALS_INDUSTRY column
print('GOALS_INDUSTRY')
print('-------------------')
print(data['GOALS_INDUSTRY'].value_counts())

### BACKGROUND_YEARS_PROFESSIONAL and BACKGROUND_YEARS_PROGRAMMING

In [None]:
# Inspect the raw answers to see which values need cleaning
data['BACKGROUND_YEARS_PROFESSIONAL'].value_counts()

In [None]:
def clean_exp_columns(data, columns, default_value):
    
    '''Convert free-text "years of experience" answers into numbers of years.

    Each value is cleaned as follows:
    1. a known free-text answer (see special_values) is replaced by its numeric equivalent
    2. anything else is passed to float()
    3. if float() fails, default_value is used

    Inputs:
    data: the dataframe (modified in place)
    columns: the columns to be cleaned
    default_value: the value to be used when an answer cannot be interpreted

    Output:
    data: the same dataframe object, with the cleaned columns
    ''' 

    special_values= {'1 year' : 1.0, 
                     '2years' : 2.0, 
                     '2 years': 2.0,
                     '2 years accountant': 2.0, 
                     '3 years': 3.0,
                     "3year's in retail and Education area but not related this subject.": 3.0,
                     '6 years': 6.0,
                     '8 years': 8.0,
                     'no professional experience': 0.0,
                     'none': 0.0,
                     'Ten' : 10.0,
                     '12+' : 12.0, 
                     'Half a year': 0.5,
                     '0.5 year': 0.5,
                     '4 months': 1/3, 
                     '6 months': 0.5, 
                     '6MONTHS': 0.5, 
                     '3 Months': 0.25, 
                     '1 month': 1/12}

    def to_years(old_value):
        '''Translate one raw answer into a number of years (or default_value).'''
        if isinstance(old_value, str) and old_value in special_values:
            return special_values[old_value]
        try:
            return float(old_value)
        except (TypeError, ValueError, OverflowError):
            return default_value

    for column in columns:
        # Build the whole column positionally: writing cell by cell through index labels
        # overwrites the wrong rows when several rows share a label.
        cleaned = [to_years(old_value) for old_value in data[column]]
        if pd.api.types.is_numeric_dtype(data[column]):
            data[column] = cleaned
        else:
            # Keep text columns as object dtype so any default_value can be stored
            data[column] = pd.Series(cleaned, dtype=object).to_numpy()

    return data

In [None]:
# Answers that cannot be interpreted as a number of years become NaN, i.e. missing values
DEFAULT_VALUE = np.nan
data = clean_exp_columns(data, ['BACKGROUND_YEARS_PROFESSIONAL', 'BACKGROUND_YEARS_PROGRAMMING'], DEFAULT_VALUE)

In [None]:
# Check the cleaned values of the professional experience column
data['BACKGROUND_YEARS_PROFESSIONAL'].value_counts()

In [None]:
# Check the cleaned values of the programming experience column
data['BACKGROUND_YEARS_PROGRAMMING'].value_counts()

## *STOP PLEASE. THE FOLLOWING IS FOR THE NEXT EXERCISE. THANKS.*


## SECTION 3: Calculating descriptive statistics

### Counting data

`value_counts()` method in pandas is useful for quickly calculating frequencies. 

In [None]:
# Frequency of each rating given for the importance of communication
print("Distribution of communication importance ratings:")
print("-------------------------------------------------")
print(data['IMPORT_COMMUNICATION'].value_counts())

### TODO: Calculate distribution of background and goal industries

In [None]:
# TODO: replace the content of this cell with your Python solution
# NOTE: running this cell unchanged raises NotImplementedError
raise NotImplementedError

### Calculating the mode

We can also use `value_counts()` to calculate the mode.

In [None]:
# First calculate the frequencies (here we also sort the values)
# The result is a Series: the ratings are the index, the counts are the values
Communication_freqs = data['IMPORT_COMMUNICATION'].value_counts()
Communication_freqs

In [None]:
# Then calculate the mode: idxmax() returns the index label (the rating) with the highest count
print("Communication mode:", Communication_freqs.idxmax())

We will probably need to calculate the _mode_ more than once. It is hence a good idea to define this as our own local function, which we can later call with different parameters depending on which data we would like to calculate the mode for, without the need to repeat all its code again and again.

In Python, one defines a local function with the **def** statement, followed by the function name and a list of arguments with which we can invoke a function later.

Our own 'mode' function is introduced and used as follows:

In [None]:
# defines a new 'mode' function
def mode(data, column):
    '''Return the most frequent value of one column.

    Inputs:
    data: the dataframe
    column: the name of the column to inspect

    Output:
    the value with the highest frequency in data[column]
    '''
    # value_counts() tallies each distinct value; idxmax() picks the value with the largest tally
    return data[column].value_counts().idxmax()

# example of how to use the 'mode' function: pass the dataframe and the column name
print("Communication mode:", mode(data, 'IMPORT_COMMUNICATION'))

### TODO: Calculate the mode of background and goal industries

In [None]:
# TODO: replace the content of this cell with your Python solution
# NOTE: running this cell unchanged raises NotImplementedError
raise NotImplementedError

## *STOP PLEASE. THE FOLLOWING IS FOR THE NEXT EXERCISE. THANKS.*


### Statistics with `numpy`

We can calculate other descriptive statistics. `numpy` includes routines for measures of centrality and dispersion. Below we calculate descriptive statistics for professional and programming experience.

Further detail: https://numpy.org/doc/stable/reference/routines.statistics.html

In [None]:
# Check data types of all columns (the two experience columns must be numeric for the statistics below)
data.dtypes

We need to convert the BACKGROUND_YEARS_PROFESSIONAL and BACKGROUND_YEARS_PROGRAMMING columns to float before we can calculate the descriptive statistics of these columns

In [None]:
# Cast both experience columns to float so numeric statistics can be computed on them
for column in ('BACKGROUND_YEARS_PROFESSIONAL', 'BACKGROUND_YEARS_PROGRAMMING'):
    data[column] = data[column].astype(float)

In [None]:
# Descriptive statistics for the two experience columns: min, max, range, mean,
# standard deviation, median, 25th and 75th percentiles, and IQR.
# The nan* numpy routines are used so that missing values are ignored.
for column in ['BACKGROUND_YEARS_PROFESSIONAL', 'BACKGROUND_YEARS_PROGRAMMING']:

    # raw numpy array behind the column
    years = data[column].values

    # extremes and quartiles are computed once and reused below
    lowest = np.nanmin(years)
    highest = np.nanmax(years)
    q1, q3 = (np.nanpercentile(years, q) for q in (25, 75))
    iqr = q3 - q1

    print(column)
    print(f"* Min: {lowest}")
    print(f"* Max: {highest}")
    print(f"* Range: {highest - lowest}")
    print(f"* Mean: {np.nanmean(years)}")
    print(f"* Standard deviation: {np.nanstd(years)}")
    print(f"* Median: {np.nanmedian(years)}")
    print(f"* 25th percentile (Q1): {q1}")
    print(f"* 75th percentile (Q3): {q3}")
    print(f"* IQR: {iqr}\n")

In [None]:
# Export the cleaned data to a new csv file; index=False leaves the row index out of the file
data.to_csv('class_survey_data_cleaned.csv', index = False)

## *STOP PLEASE. THE FOLLOWING IS FOR THE NEXT EXERCISE. THANKS.*


## SECTION 4: Dealing with missing values

In [None]:
# Check the missing values in each column (count of null cells per column)
data.isnull().sum()

In [None]:
# Check missing value distribution using missingno library (one matrix plot for the whole dataframe)
msno.matrix(data)

In [None]:
# Show rows with more than one missing value; sum(axis=1) counts the null cells in each row
data[data.isnull().sum(axis=1) > 1]

In [None]:
# Remove rows with more than one missing value (keep rows with at most one).
# .copy() makes the result an independent dataframe, so filling values into it later
# does not trigger SettingWithCopyWarning.
data = data[data.isnull().sum(axis=1) <= 1].copy()

In [None]:
# Replace the missing values in BACKGROUND_YEARS_PROFESSIONAL with the median.
# The result is assigned back to `data`: an in-place fillna on a selected column is
# chained assignment and leaves the dataframe unchanged under pandas Copy-on-Write.
data = data.fillna({'BACKGROUND_YEARS_PROFESSIONAL': data['BACKGROUND_YEARS_PROFESSIONAL'].median()})

In [None]:
# Replace the missing values in BACKGROUND_YEARS_PROGRAMMING with the mean.
# The result is assigned back to `data`: an in-place fillna on a selected column is
# chained assignment and leaves the dataframe unchanged under pandas Copy-on-Write.
data = data.fillna({'BACKGROUND_YEARS_PROGRAMMING': data['BACKGROUND_YEARS_PROGRAMMING'].mean()})

In [None]:
# Replace the missing values in the BACKGROUND_SKILLS column with the text 'None'.
# The result is assigned back to `data`: an in-place fillna on a selected column is
# chained assignment and leaves the dataframe unchanged under pandas Copy-on-Write.
data = data.fillna({'BACKGROUND_SKILLS': 'None'})

In [None]:
# Check the missing values in each column again, after the rows were removed and the gaps filled
data.isnull().sum()