This notebook covers the Data Understanding and Data Preparation steps of the CRISP-DM process.

Read in the raw data and the schema

In [21]:
import numpy as np
import pandas as pd
from IPython import display
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

Gather: Collect Initial Data

In [22]:
# Load the raw survey responses and the schema that describes each column.
# on_bad_lines = 'warn' reports and skips malformed rows, matching the old
# error_bad_lines = False behaviour; that keyword was deprecated in pandas 1.3
# and removed in pandas 2.0, where it raises a TypeError.
df = pd.read_csv(r'survey_results_public_20.csv', sep = ',', on_bad_lines = 'warn', index_col = False)
df_schem = pd.read_csv(r'survey_results_schema_20.csv', sep = ',', on_bad_lines = 'warn',
                       index_col = False)
df

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,64858,,Yes,,16,,,,United States,,...,,,,"Computer science, computer engineering, or sof...",,,,,10,Less than 1 year
64457,64867,,Yes,,,,,,Morocco,,...,,,,,,,,,,
64458,64898,,Yes,,,,,,Viet Nam,,...,,,,,,,,,,
64459,64925,,Yes,,,,,,Poland,,...,,,,,Angular;Angular.js;React.js,,,,,


Assess: Describe/Explore Data

In [23]:
# Summary statistics for the numeric columns. In the recorded run the output shows
# extreme values worth investigating before modelling (CompTotal max ~1.1e247 with
# std inf, Age max 279, WorkWeekHrs max 475).
df.describe()

Unnamed: 0,Respondent,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,64461.0,45446.0,34826.0,34756.0,41151.0
mean,32554.079738,30.834111,3.190464e+242,103756.1,40.782174
std,18967.44236,9.585392,inf,226885.3,17.816383
min,1.0,1.0,0.0,0.0,1.0
25%,16116.0,24.0,20000.0,24648.0,40.0
50%,32231.0,29.0,63000.0,54049.0,40.0
75%,49142.0,35.0,125000.0,95000.0,44.0
max,65639.0,279.0,1.1111110000000001e+247,2000000.0,475.0


Clean: Select Data

Only work with columns that have at least 70% reported data

In [24]:
def remove_nan(df, threshold):

    """
    FUNCTION: keep only the columns whose non-null count reaches a minimum share of the rows

    INPUTS:
        df - data frame to filter; the frame passed in is not modified
        threshold - fraction of rows (e.g. 0.7 for 70%) that must be non-null
                    for a column to be kept

    OUTPUTS:
        new data frame containing only the columns that meet the threshold
    """

    # Translate the fraction into the minimum number of non-null values per column.
    min_non_null = df.shape[0] * threshold
    kept = df.dropna(axis = 'columns', thresh = min_non_null)
    return kept

# Keep only the columns where at least 70% of the rows have a value;
# df is rebound to the filtered frame.
df = remove_nan(df,0.7)

Remove categories irrelevant to the project purpose

In [25]:
# Columns judged irrelevant to the project purpose.
to_drop = ['SOAccount','SOComm','SOPartFreq','SOVisitFreq','SurveyEase','SurveyLength','WelcomeChange','NEWOffTopic']
# errors = 'ignore' keeps this cell re-runnable: a second run, or a column already
# removed by the missing-data filter, no longer raises a KeyError.
# Trade-off: a misspelled name in to_drop is skipped silently as well.
df = df.drop(columns = to_drop, errors = 'ignore')

Reduce schema to represent only columns remaining in data set

In [26]:
def remove_schema_rows(df,schema):

    """
    FUNCTION: keep only the schema rows that describe columns still present in the data

    INPUTS:
        df - data frame whose column names decide which schema rows are kept
        schema - schema data frame with a 'Column' field holding column names

    OUTPUTS:
        the subset of schema rows whose 'Column' value is one of df's columns
    """

    remaining = df.columns.tolist()
    # True for schema rows whose column name is still in the data.
    is_remaining = schema.Column.isin(remaining)
    return schema.loc[is_remaining]

# Display the schema rows for the columns that remain in df. The result is not
# assigned, so df_schem still holds every schema row after this cell.
remove_schema_rows(df,df_schem)

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,Age,What is your age (in years)? If you prefer not...
4,Age1stCode,At what age did you write your first line of c...
8,Country,Where do you live?
9,CurrencyDesc,Which currency do you use day-to-day? If your ...
10,CurrencySymbol,Which currency do you use day-to-day? If your ...
12,DatabaseWorkedWith,Which database environments have you done exte...
13,DevType,Which of the following describe you? Please se...


Save cleaned data files for use in analysis

In [27]:
# Persist the cleaned survey data without the index column.
# NOTE(review): both paths are hard-coded absolute Windows paths, and the raw strings
# keep the doubled backslashes literally; consider a path relative to the notebook.
df.to_csv(r'C:\\Users\\Jessica\\pj1_stackoverflow\\pj1-stackoverflow\\reduced_data.csv', index = False)
# Filter the schema at save time so the file only describes columns kept in df.
# The earlier call to remove_schema_rows only displayed its result, so writing
# df_schem directly would save the full, unreduced schema under this file name.
remove_schema_rows(df, df_schem).to_csv(
    r'C:\\Users\\Jessica\\pj1_stackoverflow\\pj1-stackoverflow\\reduced_data_schema.csv', index = False)