In [1]:
#Importing Dependencies
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import csv

#Importing Sci-kit + Stats Models Dependencies
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score

#Models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB

#GridSearch
from sklearn.model_selection import GridSearchCV

#Pre-processing
from sklearn.preprocessing import StandardScaler

#Scipy Integration for Sparse Matrixes
from scipy import sparse

#Plotting & Visualisation Metrics
from matplotlib import pyplot as plt
import seaborn as sns

#Scoring & Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, f1_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

#Pipelines
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class Cleaner(BaseEstimator, TransformerMixin):
    
    def __init__(self, X, salary_column= None, cols_to_drop = None, columns_to_dummify= None):
        self.salary_column = salary_column
        self.salary_median = X[self.salary_column].median()
        self.cols_to_drop = cols_to_drop
        self.columns_to_dummify= columns_to_dummify
        self.X = X
    
    
    #Creating The Salary Class 
    def salary_class(self):
        data = ['HIGH SALARY' if item > self.salary_median else 'LOW SALARY' for item in self.X['Salaries($)']]
        return data
    

    #Creating The States + Locations Features
    def state_and_locations(self):
        states = [str(item).split()[-1] for item in self.X['Locations']]
        locations = [(" ".join(str(item).split()[:-1])).replace(',','') for item in cleaned_df['Locations']]   
        return states, locations
    
    
    def dropping_cols(self):
        return self.X.drop(columns= self.cols_to_drop, inplace=True)
    
    
    def make_dummy_cols(self):
        for column in self.columns_to_dummify:
            try:
                categories = np.sort(self.X[column].unique())
                for category in categories[1:]:
                    self.X[column+'_'+str(category)] = self.X[column].map(
                        lambda x: 1 if x == category else 0)
                self.X = self.X.drop(column, axis=1)
            except:
                pass
        return self.X
    
    
    def transform(self, *args):
        #Cleaning & Adding Features.
        self.X['Salary_Class'] = self.salary_class()
        self.X['State'] = self.state_and_locations()[0]
        self.X['Locations'] = self.state_and_locations()[1]
        
        #Dropping Columns.
        self.dropping_cols()
        
        #Dropping NA Values.
        self.X.dropna(how='any', inplace=True)
        
        #Dummfying The Data
        self.make_dummy_cols()
        return self.X
     
        
    def fit(self, *args):
        pass

In [74]:
# Create a TFID_Vectorizer Helper Class For The Pipeline!

from sklearn.feature_extraction.text import TfidfVectorizer

class TFID_Vectorizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, X, cols_to_vectorize = None):
        self.cols_to_vectorize = cols_to_vectorize   
        self.tvec = TfidfVectorizer(stop_words='english')
        self.original_sparse = None
        
        
    def fit(self, X, y=None, *args):
        self.tvec = self.tvec.fit(X[self.cols_to_vectorize])
        X = X[X.columns.difference(['Titles'])]
        self.original_sparse = sparse.csr_matrix(X.values)
        return self.tvec
 

    def transform(self, X, y=None, *args):
        if isinstance(X, pd.DataFrame):            
            Additional_Data = X[X.columns.difference(['Titles'])]
            Sparse_Original_Data = sparse.csr_matrix(Additional_Data.values)
            Y = self.tvec.transform(X[self.cols_to_vectorize])
            X = sparse.hstack([Y, Sparse_Original_Data])
            
        else:    
            print('test')

In [75]:
cleaned_df = pd.read_csv("exported_dataframe.csv")
Clean_mod = Cleaner(salary_column='Salaries($)', X = cleaned_df, cols_to_drop=['Companies', 'Salaries($)']
                   , columns_to_dummify = ['State', 'Locations'] )
X = Clean_mod.transform()
y = X.pop('Salary_Class')

In [76]:
#Performing A Train Test Split.

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=1, stratify=y)

In [77]:
testing = TFID_Vectorizer(X, cols_to_vectorize='Titles')

In [78]:
testing.fit_transform(X_train)

<87x304 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [37]:
len(train)

NameError: name 'train' is not defined

In [55]:
testing.fit_transform(X_train)


(230, 86)


<87x304 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

X_test Data Shape 

- Sparse Original Shape: 58 rows ,86 columns
- Job Titles Original Shape: 58 rows, 304 columns
- New Sparse Matrix Shape: 58 rows, 390 columns

In [39]:
testing.fit_transform(X_train)

(230, 86)


<87x304 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [40]:
#Creating The Pipeline

pipe = Pipeline(memory=None,
         steps=[('TFID_Vectorizing', testing,
                ('LogisticRegression', LogisticRegressionCV())] verbose=False)

SyntaxError: invalid syntax (<ipython-input-40-3879ebf30003>, line 5)

In [41]:
pipeline

NameError: name 'pipeline' is not defined

In [48]:
a = sparse.csr_matrix(X_train[X_train.columns.difference(['Titles'])])

In [49]:
a.shape

(230, 86)

In [53]:
(sparse.hstack([a, a])).shape

(460, 86)