In [None]:
# Import main libraries
import os

# The conventional `plt` alias must point at the pyplot submodule; aliasing the
# top-level `matplotlib` package would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

## 1. Data Gathering
Gathering é o primeiro passo da etapa de Data Wrangling.

**Best Practice:** Download de arquivos programaticamente.

**Motivo:**

* Scalability: Essa automação poupa tempo e previne erros.
* Reproducibility: Qualquer um pode reproduzir seu trabalho.

Explicar o que é o ciclo **REQUEST - RESPONSE**

In [None]:
# Root directory for the files this notebook downloads and extracts.
main_data_dir = "world_data"

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(main_data_dir, exist_ok=True)

In [None]:
# Download the zipped dataset
import requests

# NOTE(review): this is a signed URL carrying an `Expires` query parameter, so it is
# presumably only valid for a limited time -- generate a fresh link if the request fails.
url = "https://storage.googleapis.com/kaggle-data-sets/1844/3192/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1588731384&Signature=OvJwgiXmSCy6znB2fQ96%2BUDvqHNzIEJb52DJilVXEdZvh2RLarUgzBEO2KcCDcnZwUXqSWTzqgEfsIE7%2F7WR2PHoBKuFDLBlrgK%2FQwIfE6M%2Fj8BdVhe%2FnsnCQaKSVF14cQB7tFcg6F8CcHYkVEKdokotXwHVViVwn4%2Br3owzoXfsMCaOum9CVXJd429hiUgUB5vXO4HCDtpVMflFGAnsc8JDK5jnmyfGw8TNGb2qLZaJZ8S7%2BiriLX6acCwiMvGz3L2%2BhRVzZvUddKm57C8OLXMsFrTvkicZXbIA9WHfMuxBKb1g4OLV0WNBKO27A43%2B0KQ2wzmApSvAbk4rcGMe4A%3D%3D&response-content-disposition=attachment%3B+filename%3Darmenian-online-job-postings.zip"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error instead of saving the error body as a .zip file.
response.raise_for_status()

# Write the raw response bytes to disk.
with open(os.path.join(main_data_dir, "armenian-online-job-postings.zip"), mode="wb") as file:
    file.write(response.content)

In [None]:
# Unzip the downloaded archive
import zipfile

# Directory that receives the extracted files
extract_dir = os.path.join(main_data_dir, "armenian_data")

# exist_ok=True: no error when the directory already exists (safe to re-run)
os.makedirs(extract_dir, exist_ok=True)

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(os.path.join(main_data_dir, "armenian-online-job-postings.zip"), "r") as archive:
    archive.extractall(extract_dir)

### 1.1 Loading Data

In [None]:
# Load the extracted job postings CSV into a DataFrame.
df_job = pd.read_csv(os.path.join(extract_dir, "online-job-postings.csv"))

## 2. Data Assessing

In [None]:
# Preview the first 5 rows of the raw data
df_job.head(n=5)

In [None]:
# Preview the last 5 rows of the raw data
df_job.tail(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df_job.info()

In [None]:
# Count how often each distinct StartDate value occurs and show the 50 most frequent.
df_job["StartDate"].value_counts().head(50)

#### Quality
*dataframe1*

* A coluna `StartDate` possui vários valores diferentes que representam a mesma coisa: ASAP
* Nomes das colunas não estão no padrão
* Colunas `ApplicationP` e `AboutC` não são descritivas o suficiente

*dataframe2*

#### Tidiness

## 3. Data Cleaning 

Improving the **quality** of a dataset, or cleaning the dataset, does not mean changing the data (because that could be **data fraud**).

Cleaning means correcting the data or removing the data.

* Inaccurate, wrong or irrelevant data.
* Replacing or filling (NULL or NA values) data.
* Combining/Merging datasets.

Improving the **tidiness** means transforming the dataset so that it follows:

* each observation = row
* each variable = column

There are two ways to clean the data: manually and programmatically.

#### Manually

To be avoided.

#### Programmatic

There are three steps:

1. Define
2. Code
3. Test

>**Defining** means defining a data cleaning plan in writing, where we turn our assessments into defined cleaning tasks. This plan will also serve as an instruction list so others (or us in the future) can look at our work and reproduce it.

>**Coding** means translating these definitions to code and executing that code.

>**Testing** means testing our dataset, often using code, to make sure our cleaning operations worked.

Text from the class notes.

In [None]:
# Work on an independent (deep) copy so the original df_job stays untouched
df_clean = df_job.copy(deep=True)

#### 3.1. Fixing the columns header

In [None]:
# Normalize every column name to lowercase.
df_clean.columns = [col.lower() for col in df_clean.columns]

# Give the abbreviated / misspelled headers descriptive names.
# Lowercasing never inserts an underscore, so the misspelled requirements column
# can only appear as "jobrequirment"; the original "job_requirment" key is kept for
# backward compatibility (rename ignores keys that are not present).
# NOTE(review): the raw header is presumably "JobRequirment" -- confirm with df_job.columns.
df_clean = df_clean.rename(columns={"applicationp": "application_procedure",
                                    "aboutc": "about_company",
                                    "requiredqual": "required_qualifications",
                                    "jobrequirment": "job_requirements",
                                    "job_requirment": "job_requirements"})

#### 3.2. Set a single name for ASAP start dates

In [None]:
# Lowercase substrings that mark a start date as "as soon as possible".
asap_keywords = ("immediat", "asap", "as soon as possible", "earliest", "upon")

# Values that contain one of the keywords but must NOT be treated as ASAP.
asap_exceptions = {"Upon availability"}

# Collect every distinct start date that means ASAP, in order of first appearance.
# Filtering the exception inside the comprehension (instead of pop/index afterwards)
# avoids a ValueError when the exception value is not present in the data.
asap_list = [
    start_date
    for start_date in df_clean["startdate"].dropna().unique()
    if start_date not in asap_exceptions
    and any(keyword in start_date.lower() for keyword in asap_keywords)
]
asap_list

In [None]:
def replace_asap(row):
    """Return the normalized start date for one row.

    Reads ``row["startdate"]`` and returns ``np.nan`` when it is missing,
    ``"ASAP"`` when it is listed in the module-level ``asap_list``, and the
    original value otherwise.
    """
    value = row["startdate"]

    # Missing values are passed through as NaN.
    if pd.isna(value):
        return np.nan

    return "ASAP" if value in asap_list else value

    
# Run replace_asap on every row (axis=1) and overwrite the startdate column with the result.
df_clean["startdate"] = df_clean.apply(replace_asap, axis=1)

In [None]:
# Test: show the 50 most frequent start dates after cleaning.
df_clean["startdate"].value_counts().head(50)

In [None]:
# Persist the cleaned data. index=False keeps the positional row index (the frame was
# loaded without an index column) from being written as an extra unnamed column.
df_clean.to_csv("clean-online-job-postings.csv", index=False)