### Indeed.ca Data Munging

#### Python Pandas

In [2]:
import pandas as pd

In [5]:
# https://www.pythonprogramming.in/how-to-read-specific-columns-of-csv-file-using-pandas.html
# reading in specific columns by name
data = pd.read_csv("sept_final_scrape.csv", usecols = ['job_title','company_name','location','summary','salary'])

In [5]:
# http://www.datasciencemadesimple.com/get-unique-values-rows-dataframe-python-pandas/
# duplicated postings in raw scrape, keep only unique
df = data.drop_duplicates()
df.head()

Unnamed: 0,job_title,company_name,location,summary,salary
0,"Summer Intern, Data Science & Engineering",HSBC Bank Canada,"Toronto, ON",This could be building and changing predictive...,Nothing_found
1,Student Intern,Skyworks,"Ottawa, ON",Has strong data analysis skill. Support PEs wi...,Nothing_found
2,Dog Sitter,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"
3,Care for Dogs,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"
4,Dog Boarding,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"


In [42]:
# https://stackoverflow.com/questions/28885073/reindexing-after-pandas-drop-duplicates
# reindex using pandas builtin
# need to reindex otherwise can't loop
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,job_title,company_name,location,summary,salary
0,"Summer Intern, Data Science & Engineering",HSBC Bank Canada,"Toronto, ON",This could be building and changing predictive...,Nothing_found
1,Student Intern,Skyworks,"Ottawa, ON",Has strong data analysis skill. Support PEs wi...,Nothing_found
2,Dog Sitter,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"
3,Care for Dogs,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"
4,Dog Boarding,Rover,"Ottawa, ON",Rover dog sitters come from a variety of backg...,"$1,000 a week"


In [49]:
# make another df with more relevant job titles
columns = ["job_title", "company_name", "location", "summary", "salary"]
df2 = pd.DataFrame(columns = columns)

a = 0
substrings = ['Data', 'Intern'] #test substrings
for index in df.index:
    for substring in substrings:
        if substring in df.iloc[index][0]:
            df2.loc[a] = df.iloc[index]
            a += 1
        else:
            continue

In [52]:
pd.set_option('display.max_columns', None)
df2

Unnamed: 0,job_title,company_name,location,summary,salary
0,"Summer Intern, Data Science & Engineering",HSBC Bank Canada,"Toronto, ON",This could be building and changing predictive...,Nothing_found
1,"Summer Intern, Data Science & Engineering",HSBC Bank Canada,"Toronto, ON",This could be building and changing predictive...,Nothing_found
2,Student Intern,Skyworks,"Ottawa, ON",Has strong data analysis skill. Support PEs wi...,Nothing_found
3,Data Scientist Intern,Geotab,"Oakville, ON","As a Data Scientist Intern, you will work alon...",Nothing_found
4,Data Scientist Intern,Geotab,"Oakville, ON","As a Data Scientist Intern, you will work alon...",Nothing_found
5,"Data Engineer, Omnia AI - Co-op/Intern Winter ...",Deloitte,"Toronto, ON","Fascinated with data structures, data models a...",Nothing_found
6,"Data Engineer, Omnia AI - Co-op/Intern Winter ...",Deloitte,"Toronto, ON","Fascinated with data structures, data models a...",Nothing_found
7,Data Science Intern - Winter 2020 (Toronto),Intact,"Toronto, ON",We are currently looking for interns for our D...,Nothing_found
8,Data Science Intern - Winter 2020 (Toronto),Intact,"Toronto, ON",We are currently looking for interns for our D...,Nothing_found
9,Data Insight Specialist (Co-op/Intern),General Motors,"Oshawa, ON",The Data Insight Specialist will assist the wa...,Nothing_found


In [80]:
# read all csvs
# save content from each in one dataframe; decided not to save in separate ones because overlap too great

csv_list = ["data+intern","devops+intern","full+stack+intern","python+intern", "tester+intern"]

# heuristic process of coming up with relevant keywords
# for example, in devops csv, jobs like "backend developer" shows, still relevant
# how to allow for wobble in keyword search?
keyword_list = ['Data',
                'Intern',
                'Devops',
                'Programmer',
                'Developer',
                'Backend',
                'Frontend',
                'Stack',
                'Engineer',
                'Automation',
                'Software',
                'Co-op']


columns = ["job_title", 
               "company_name", 
               "location", 
               "summary", 
               "salary"]

df2 = pd.DataFrame(columns = columns)
a = 0

for csv in csv_list:
    data = pd.read_csv(csv + ".csv", usecols=['job_title',
                                              'company_name',
                                              'location',
                                              'summary',
                                              'salary'])
    df = data.drop_duplicates()
    df = df.reset_index(drop=True)

    for index in df.index:
        for keyword in keyword_list:
            if keyword in df.iloc[index][0]:
                df2.loc[a] = df.iloc[index]
                a += 1
            else:
                continue

# last drop of dupes, reset index for final
df_final = df2.drop_duplicates()
df_final = df_final.reset_index(drop=True)

In [81]:
df_final

Unnamed: 0,job_title,company_name,location,summary,salary
0,"Summer Intern, Data Science & Engineering",HSBC Bank Canada,"Toronto, ON",This could be building and changing predictive...,Nothing_found
1,Student Intern,Skyworks,"Ottawa, ON",Has strong data analysis skill. Support PEs wi...,Nothing_found
2,Data Scientist Intern,Geotab,"Oakville, ON","As a Data Scientist Intern, you will work alon...",Nothing_found
3,"Data Engineer, Omnia AI - Co-op/Intern Winter ...",Deloitte,"Toronto, ON","Fascinated with data structures, data models a...",Nothing_found
4,Data Science Intern - Winter 2020 (Toronto),Intact,"Toronto, ON",We are currently looking for interns for our D...,Nothing_found
5,Data Insight Specialist (Co-op/Intern),General Motors,"Oshawa, ON",The Data Insight Specialist will assist the wa...,Nothing_found
6,Data Analyst Intern,Upfeat Media Inc.,"Winnipeg, MB",We are looking for a Data Analyst Intern to jo...,Nothing_found
7,"Intern, Data Science",RS Energy Group,"Calgary, AB",Analyze and interpret heterogeneous data. Trou...,Nothing_found
8,Instructional Writer - Data Science Intern,Geotab,"Oakville, ON",Geotab is seeking an intern who will immediate...,Nothing_found
9,CO OP/Intern: Photonics Networking,Ciena,"Ottawa, ON",The Autonomous Photonic Networks team is respo...,Nothing_found


In [83]:
# for use in another script for analysis
df_final.to_csv("september_test_scrapes/sept_final_scrape.csv")

In [8]:
data.describe()

Unnamed: 0,job_title,company_name,location,summary,salary
count,976,976,976,976,976
unique,521,252,74,895,25
top,Marketing Intern,TD Bank,"Toronto, ON",Tell us your story. Don't go unnoticed. Explai...,Nothing_found
freq,11,99,347,8,933
