# Cleaning

## Import Libraries

In [53]:
import pandas as pd
import re

In [54]:
# Cleaning functions live in a .py file inside the src folder; the import path
# is extended before importing them.
import sys
# NOTE(review): "/" is the filesystem root, not the project folder. Presumably
# `src` is importable for another reason (e.g. the working directory) —
# confirm which path was actually intended here.
sys.path.append("/")
# NOTE(review): the star import hides which helpers are brought in; none of
# the cells visible in this notebook obviously uses one of them.
from src.cleaning import *

## Import data from scraping

In [55]:
mysql = pd.read_csv("mysql.csv")

In [56]:
mongo = pd.read_csv("mongo.csv")

In [57]:
tableau = pd.read_csv("tableau.csv")

In [58]:
python = pd.read_csv("python.csv")

In [59]:
ml = pd.read_csv("machine_learning.csv")

In [60]:
java = pd.read_csv("java.csv")

In [61]:
# Stack the per-search frames into a single table. ignore_index=True gives the
# result a fresh 0..n-1 index directly, so there is no need to reset the index
# and then drop the leftover 'index' column.
courses = pd.concat([mysql, mongo, tableau, python, ml, java], ignore_index=True)

# 'Unnamed: 0' is read in from the CSV files (presumably the row index saved
# by the scraping step — TODO confirm) and is not needed for the analysis.
courses = courses.drop(columns='Unnamed: 0')

## Evaluation of data

In [65]:
courses.shape

(1720, 6)

In [66]:
courses.head(3)

Unnamed: 0,course,rating,students,level,university,skills
0,Learn SQL Basics for Data Science,4.6,400 mil,Beginner,"University of California, Davis",MySQL
1,Excel to MySQL: Analytic Techniques for Business,4.6,660 mil,Beginner,Duke University,MySQL
2,Introduction to Structured Query Language (SQL),4.8,140 mil,Intermediate,University of Michigan,MySQL


In [67]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1720 entries, 0 to 1719
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   course      1720 non-null   object
 1   rating      1720 non-null   object
 2   students    1720 non-null   object
 3   level       1720 non-null   object
 4   university  1720 non-null   object
 5   skills      1720 non-null   object
dtypes: object(6)
memory usage: 80.8+ KB


In [68]:
# Check whether the different course searches returned the same course more than once.

In [69]:
# Hago una comprobación para ver si tengo cursos repetidos en mis búsquedas de los diferentes cursos.

In [70]:
# Copy of the table without the 'skills' column, used to compare rows on the
# remaining columns only.
no_skills = courses.drop(columns="skills")

In [71]:
no_skills.head()

Unnamed: 0,course,rating,students,level,university
0,Learn SQL Basics for Data Science,4.6,400 mil,Beginner,"University of California, Davis"
1,Excel to MySQL: Analytic Techniques for Business,4.6,660 mil,Beginner,Duke University
2,Introduction to Structured Query Language (SQL),4.8,140 mil,Intermediate,University of Michigan
3,Managing Big Data with MySQL,4.7,170 mil,Mixed,Duke University
4,Building a Dynamic Web App using PHP & MySQL,4.4,"5,8 mil",Beginner,Coursera Project Network


In [72]:
no_skills.shape

(1720, 5)

In [73]:
# Keep only the first occurrence of each fully identical row.
no_skills = no_skills[~no_skills.duplicated()]

In [74]:
no_skills.shape

(1508, 5)

In [75]:
# For now I do not drop any rows. During the EDA I will keep in mind that some courses are duplicated
# and will probably have to group by course whenever the analysis requires it.

In [76]:
# Tengo cursos duplicados en mis diferentes búsquedas.
# Por ahora lo dejo indicado así y ya sé que de cara al EDA tendré que agrupar cuando me interese un análisis por curso.

### Rating Column

In [77]:
# Convert the rating column to numeric; the string "None" becomes NaN.

In [78]:
# Hago la columna Rating numérica y convierto mi string "None" en un nan

In [79]:
courses.rating.unique()

array(['4.6', '4.8', '4.7', '4.4', '4.3', '4.5', '4.9', 'None', '5',
       '3.9', '4.2', '4.1', '3.7', '4', '3.8', '3.4', '3', '3.2', '3.3',
       '3.1', '2.5', '3.6', '2.3', '3.5'], dtype=object)

In [80]:
# errors="coerce" turns values that cannot be parsed as numbers (here the
# literal string "None") into NaN instead of raising.
courses["rating"] = pd.to_numeric(courses["rating"], errors="coerce")

In [81]:
courses.rating.unique()

array([4.6, 4.8, 4.7, 4.4, 4.3, 4.5, 4.9, nan, 5. , 3.9, 4.2, 4.1, 3.7,
       4. , 3.8, 3.4, 3. , 3.2, 3.3, 3.1, 2.5, 3.6, 2.3, 3.5])

### Level Column

In [82]:
# Replace the "None" string with None. For my analysis, it is not worth transforming this column into numeric values.

In [83]:
courses.level.unique()

array(['Beginner', 'Intermediate', 'Mixed', 'Advanced', 'None'],
      dtype=object)

In [84]:
# Replace the placeholder string "None" with a real missing value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on courses['level']: that chained in-place call
# works on a temporary selection, emits a FutureWarning on recent pandas and
# leaves `courses` unchanged once Copy-on-Write is enabled.
courses['level'] = courses['level'].replace({"None": None})

In [85]:
courses.level.unique()

array(['Beginner', 'Intermediate', 'Mixed', 'Advanced', None],
      dtype=object)

### Students Column

In [86]:
# Convert the students column from strings to numbers.

In [87]:
# Distinct raw values of the students column, as a plain Python list.
students = courses["students"].unique().tolist()

In [88]:
# First pass: remove the " mil" suffix from every distinct value.
lista = [re.sub(" mil", "", raw) for raw in students]

In [89]:
# Second pass: turn the decimal comma into a decimal point.
lista_2 = [re.sub(",", ".", text) for text in lista]

In [90]:
# Pair every raw value with its cleaned form so the students column of
# `courses` can be replaced in a single call.
dict_from_list = {raw: cleaned for raw, cleaned in zip(students, lista_2)}

In [91]:
# Apply the raw -> cleaned mapping to the students column only.
courses = courses.replace(to_replace={"students": dict_from_list})

In [98]:
# Second mapping for the students column: values that still end in " m" are
# turned into plain numbers by multiplying by 1000, e.g. '1.2 m' -> 1200
# (presumably "m" stands for millions, which puts them on the same thousands
# scale as the former " mil" values — TODO confirm).
# The mapping is built from the values actually present in the column rather
# than from a hand-written list, so an unlisted value such as '3.1 m' is not
# silently turned into NaN when the column is later coerced to numeric.
_millions_pattern = re.compile(r"(\d+(?:\.\d+)?) m")

dict_ = {}
for raw in courses["students"].unique():
    # Non-string entries (already numeric or missing) are left untouched.
    match = _millions_pattern.fullmatch(raw) if isinstance(raw, str) else None
    if match:
        # round() returns an int, matching the integer values of the original table.
        dict_[raw] = round(float(match.group(1)) * 1000)

In [99]:
# Apply the " m" mapping to the students column only.
courses = courses.replace(to_replace={"students": dict_})

In [100]:
courses.students.unique()

array(['400', '660', '140', '170', '5.8', '28', '390', 'None', '8.6', '5',
       '22', '1.9', '300', '2', '1.8', '11', '130', '4.5', '56', '41',
       '20', '18', '2.9', '13', '110', '150', '52', '3.2', '8.7', '59',
       '2.8', '1.7', '6.8', '6.2', '360', '350', '35', '250', '46', '43',
       '2.7', '29', '50', '370', '160', '240', '45', '95', '82', '21',
       '12', '5.6', '53', '3.4', '1.5', '4.3', '14', '2.3', '5.2', '6.9',
       '3.3', '2.5', '4.7', '4.2', '190', '270', '70', '7.5', '44', '58',
       '2.2', '3.6', '6.1', '4.8', '3.5', '17', '5.4', '37', '51', '9.8',
       '30', '6.5', '740', '210', '24', '100', '48', '4.1', '3.9', '710',
       '16', '2.1', '4', '5.3', 2700, '600', '520', '570', '800', 2500,
       '380', 1000, '470', '80', '550', '720', '73', '120', '200', '280',
       '25', '830', '500', 4700, '75', '330', '8.4', '54', '230', '4.6',
       '410', '49', '27', '97', '9.3', '220', '7.6', '85', '8.5', '32',
       '39', '62', '6.6', '15', '84', '36', '33', 

In [101]:
# The column now holds number-like strings and integers, so it can be converted
# to a numeric dtype; anything unparsable (e.g. the string "None") becomes NaN.
courses["students"] = pd.to_numeric(courses["students"], errors="coerce")

In [102]:
courses.students.unique()

array([4.0e+02, 6.6e+02, 1.4e+02, 1.7e+02, 5.8e+00, 2.8e+01, 3.9e+02,
           nan, 8.6e+00, 5.0e+00, 2.2e+01, 1.9e+00, 3.0e+02, 2.0e+00,
       1.8e+00, 1.1e+01, 1.3e+02, 4.5e+00, 5.6e+01, 4.1e+01, 2.0e+01,
       1.8e+01, 2.9e+00, 1.3e+01, 1.1e+02, 1.5e+02, 5.2e+01, 3.2e+00,
       8.7e+00, 5.9e+01, 2.8e+00, 1.7e+00, 6.8e+00, 6.2e+00, 3.6e+02,
       3.5e+02, 3.5e+01, 2.5e+02, 4.6e+01, 4.3e+01, 2.7e+00, 2.9e+01,
       5.0e+01, 3.7e+02, 1.6e+02, 2.4e+02, 4.5e+01, 9.5e+01, 8.2e+01,
       2.1e+01, 1.2e+01, 5.6e+00, 5.3e+01, 3.4e+00, 1.5e+00, 4.3e+00,
       1.4e+01, 2.3e+00, 5.2e+00, 6.9e+00, 3.3e+00, 2.5e+00, 4.7e+00,
       4.2e+00, 1.9e+02, 2.7e+02, 7.0e+01, 7.5e+00, 4.4e+01, 5.8e+01,
       2.2e+00, 3.6e+00, 6.1e+00, 4.8e+00, 3.5e+00, 1.7e+01, 5.4e+00,
       3.7e+01, 5.1e+01, 9.8e+00, 3.0e+01, 6.5e+00, 7.4e+02, 2.1e+02,
       2.4e+01, 1.0e+02, 4.8e+01, 4.1e+00, 3.9e+00, 7.1e+02, 1.6e+01,
       2.1e+00, 4.0e+00, 5.3e+00, 2.7e+03, 6.0e+02, 5.2e+02, 5.7e+02,
       8.0e+02, 2.5e

In [103]:
courses.head()

Unnamed: 0,course,rating,students,level,university,skills
0,Learn SQL Basics for Data Science,4.6,400.0,Beginner,"University of California, Davis",MySQL
1,Excel to MySQL: Analytic Techniques for Business,4.6,660.0,Beginner,Duke University,MySQL
2,Introduction to Structured Query Language (SQL),4.8,140.0,Intermediate,University of Michigan,MySQL
3,Managing Big Data with MySQL,4.7,170.0,Mixed,Duke University,MySQL
4,Building a Dynamic Web App using PHP & MySQL,4.4,5.8,Beginner,Coursera Project Network,MySQL


In [104]:
courses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1720 entries, 0 to 1719
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   course      1720 non-null   object 
 1   rating      1335 non-null   float64
 2   students    1132 non-null   float64
 3   level       1701 non-null   object 
 4   university  1720 non-null   object 
 5   skills      1720 non-null   object 
dtypes: float64(2), object(4)
memory usage: 80.8+ KB


In [109]:
courses.shape

(1720, 6)

In [105]:
# Write the cleaned dataframe to a CSV file.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# extra unnamed first column that read_csv will load as 'Unnamed: 0'. Consider
# index=False once the downstream readers have been checked.
output_path = "coursera_cleaned.csv"
courses.to_csv(output_path)