# Datasets

### Cleanup & Loading

In [1]:
# Data Manipulation
import pandas as pd

# URL Library
import urllib.request
import zipfile

In [2]:
# Where the survey archive lives, where to store it locally, and which
# member of the archive holds the responses.
url = (
    'https://github.com/mattharrison/datasets/raw/master/data/'
    'kaggle-survey-2018.zip'
)
fname = 'kaggle-survey-2018.zip'
member_name = 'multipleChoiceResponses.csv'

In [3]:
# Extraction function
def extract_zip(src, dst, member_name):
    """Download a zip archive and read one CSV member into a DataFrame.

    The archive at ``src`` is downloaded in full and written to ``dst``.
    The member ``member_name`` is then parsed with ``pandas.read_csv``.
    The first data row of the parsed table is dropped from the result
    (earlier code kept that row under the name ``kag_questions``).

    Parameters:
        src (str): URL of the zip file to be downloaded and extracted.
        dst (str): Local file path where the zip file will be written.
        member_name (str): Name of the member file inside the zip file
            to be read into a DataFrame.

    Returns:
        pandas.DataFrame containing the contents of the member file,
        without its first row. The original index labels are kept, so
        the result's index starts at 1.
    """
    # Use a context manager so the HTTP response is closed even on error.
    with urllib.request.urlopen(src) as fin:
        data = fin.read()
    with open(dst, mode='wb') as fout:
        fout.write(data)
    # Close the member handle as well as the archive once parsing is done.
    with zipfile.ZipFile(dst) as z, z.open(member_name) as member:
        kag = pd.read_csv(member)
    return kag.iloc[1:]

In [4]:
# Download the archive and load the responses table (performs network I/O).
raw = extract_zip(src=url, dst=fname, member_name=member_name)

  kag = pd.read_csv(z.open(member_name))


## Cleanup Pipeline

In [5]:
def tweak_kag(df_: pd.DataFrame) -> pd.DataFrame:
    """Derive a compact feature frame from the raw survey responses.

    Reads the columns Q1, Q2, Q3, Q4, Q5, Q8, Q9 and Q16_Part_1..3 of
    ``df_`` and returns a new frame; ``df_`` itself is not modified.

    Parameters:
        df_: Raw survey responses (one row per respondent).

    Returns:
        A DataFrame with exactly these columns, in this order:
        Q1, Q3, age, education, major, years_exp, compensation,
        python, r, sql.

        * age          - first two characters of Q2 as an int.
        * education    - Q4 answers mapped to a number via a lookup table;
                         unmapped answers are left unchanged.
        * major        - the 3 most frequent Q5 answers (shortened to
                         'cs', 'eng', 'stat' where they match), else
                         'other'.
        * years_exp    - lower bound of the Q8 range as a float.
        * compensation - lower bound of the Q9 range times 1000, as an
                         int; missing/undisclosed answers become 0.
        * python/r/sql - 1 where the respective Q16 part is filled in
                         with 'Python'/'R'/'SQL', 0 where it is missing.
    """
    # Answer text -> numeric education level.
    # NOTE(review): keys must match the survey's answer text exactly.
    # Spelling/casing of several labels was corrected and 'Doctoral degree'
    # was added (value chosen to continue the existing scale) -- confirm
    # against the actual Q4 values.
    education_levels = {
        "Master's degree": 18,
        "Bachelor's degree": 16,
        "Doctoral degree": 20,
        "Some college/university study without earning a bachelor's degree": 13,
        "Professional degree": 19,
        "I prefer not to answer": None,
        "No formal education past high school": 12,
    }
    # Accept the typographic apostrophe (U+2019) as well as the ASCII one,
    # since the lookup is an exact string match.
    education_levels.update(
        {label.replace("'", "\u2019"): level
         for label, level in list(education_levels.items())}
    )

    major_abbrev = {
        'Computer science (software engineering, etc.)': 'cs',
        'Engineering (non-computer focused)': 'eng',
        # NOTE(review): was 'Mathematics of statistics' -- assumed typo.
        'Mathematics or statistics': 'stat',
    }

    # An explicit list avoids the stray leading spaces that splitting a
    # 'a, b, c' string on ',' would put into the column names.
    keep_columns = ['Q1', 'Q3', 'age', 'education', 'major', 'years_exp',
                    'compensation', 'python', 'r', 'sql']

    return (df_
            .assign(age=df_.Q2.str.slice(0, 2).astype(int),
                    education=df_.Q4.replace(education_levels),
                    major=(df_.Q5
                           .pipe(topn, n=3)
                           .replace(major_abbrev)
                           ),
                    years_exp=(df_.Q8.str.replace('+', '', regex=False)
                               .str.split('-', expand=True)
                               .iloc[:, 0]  # first column = lower bound
                               .astype(float)),
                    compensation=(df_.Q9.str.replace('+', '', regex=False)
                                  .str.replace(',', '', regex=False)
                                  .str.replace('500000', '500', regex=False)
                                  # No leading space in the pattern: a
                                  # leftover ' 0' still parses as int 0,
                                  # whereas an unmatched sentence would
                                  # make astype(int) fail.
                                  .str.replace('I do not wish to disclose my approximate yearly compensation', '0', regex=False)
                                  .str.split('-', expand=True)
                                  .iloc[:, 0]
                                  .fillna(0)
                                  .astype(int)
                                  .mul(1_000)
                                  ),
                    python=df_.Q16_Part_1.fillna(0).replace('Python', 1),
                    r=df_.Q16_Part_2.fillna(0).replace('R', 1),
                    sql=df_.Q16_Part_3.fillna(0).replace('SQL', 1)
                    )  # assign
            .rename(columns=lambda col: col.replace(' ', '_'))
            .loc[:, keep_columns]
            )

In [6]:
def topn(ser, n=5, default='other'):
    """Keep the ``n`` most frequent values of ``ser``; replace the rest.

    Parameters:
        ser: pandas Series to collapse.
        n: number of leading entries of ``ser.value_counts()`` to keep.
        default: replacement for every entry that is not among them
            (this includes missing values).

    Returns:
        A Series with the same index as ``ser``.
    """
    most_frequent = ser.value_counts().index[:n]
    is_frequent = ser.isin(most_frequent)
    return ser.where(is_frequent, default)

In [7]:
from feature_engine import encoding, imputation
from sklearn import base, pipeline



In [8]:
class TweakKagTransformer(base.BaseEstimator, base.TransformerMixin):
    """Pipeline-compatible wrapper around ``tweak_kag``.

    ``transform`` hands the incoming DataFrame to ``tweak_kag`` and
    returns whatever that function produces. ``fit`` learns nothing
    and returns the transformer unchanged.

    Parameters
    ----------
    ycol : str, optional
        Name of the column to be used as the target variable.
        Defaults to ``None``.

    Attributes
    ----------
    ycol : str or None
        The value passed to the constructor. It is stored only; no
        method of this class reads it.
    """

    def __init__(self, ycol=None):
        self.ycol = ycol

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X):
        cleaned = tweak_kag(X)
        return cleaned

In [11]:
def get_rawX_y(df, y_col):
    """Filter the survey rows and split them into features and target.

    Only rows whose Q3 is one of "United States of America", "China" or
    "India" and whose Q6 is "Data Scientist" or "Software Engineer" are
    kept.

    Parameters:
        df: DataFrame with at least the columns Q3, Q6 and ``y_col``.
        y_col: name of the target column.

    Returns:
        Tuple ``(X, y)``: the filtered frame without ``y_col``, and the
        filtered ``y_col`` Series.
    """
    row_filter = (
        'Q3.isin(["United States of America", "China", "India"]) '
        'and Q6.isin(["Data Scientist", "Software Engineer"])'
    )
    subset = df.query(row_filter)
    features = subset.drop(columns=[y_col])
    target = subset[y_col]
    return features, target

In [15]:
# Preprocessing pipeline: clean the raw frame, encode the categorical
# columns, then fill gaps in the numeric ones.
kag_pl = pipeline.Pipeline(
    steps=[
        ('tweak', TweakKagTransformer()),
        (
            'cat',
            encoding.OneHotEncoder(
                top_categories=5,
                drop_last=True,
                variables=['Q1', 'Q3', 'major'],
            ),
        ),
        (
            'num_impute',
            imputation.MeanMedianImputer(
                imputation_method='median',
                variables=['education', 'years_exp'],
            ),
        ),
    ]
)