In [72]:
import pandas as pd
import matplotlib.pyplot as plt

def get_datasets():
    """Load the Win-Loss sales data set, downloading it on first use.

    The CSV is read from ``datasets/Win-Loss.csv`` relative to the current
    working directory. When that file does not exist it is fetched from the
    Watson Analytics URL, written to the same path, and then read.

    Returns:
        pandas.DataFrame: the parsed contents of the CSV file.

    Raises:
        urllib.error.URLError: if the file is missing locally and the
            download fails.
    """
    # Local imports keep the block self-contained. ``import urllib`` alone
    # does not guarantee that the ``urllib.request`` submodule is loaded.
    import os
    import urllib.request

    csv_path = "datasets/Win-Loss.csv"
    csv_url = "https://community.watsonanalytics.com/wp-content/uploads/2015/04/WA_Fn-UseC_-Sales-Win-Loss.csv"

    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        # Download the whole payload first so that a failed request does not
        # leave an empty or partial file behind; the context manager closes
        # the HTTP response.
        with urllib.request.urlopen(csv_url) as response:
            payload = response.read()

        # On a fresh checkout the target directory may not exist yet.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        with open(csv_path, 'wb') as output:
            output.write(payload)
        return pd.read_csv(csv_path)

def get_only_numerics(data):
    """Return the names of the ``int64`` and ``float64`` columns of *data*.

    The dtypes are read from the frame that is passed in, so the result
    always describes *data* itself rather than whatever a module-level
    variable happened to hold.

    Args:
        data: a pandas DataFrame.

    Returns:
        list: column names whose dtype is ``int64`` or ``float64``, in the
        frame's column order. Empty when no column matches.
    """
    return [
        column
        for column, dtype in data.dtypes.items()
        if dtype in ('int64', 'float64')
    ]

def view_graphs(sales_data):
    """Show exploratory plots for the sales data.

    Two figures are displayed one after the other with ``plt.show()``:
    the output of ``sales_data.hist()`` and a seaborn count plot of
    ``Route To Market`` split by ``Opportunity Result``.

    Args:
        sales_data: DataFrame containing at least the ``Route To Market``
            and ``Opportunity Result`` columns.
    """
    # seaborn is only needed for plotting, so it is imported on demand
    import seaborn as sns

    # figure size handed to seaborn's global settings before plotting
    figure_size = (20.7, 20.27)
    sns.set(rc={'figure.figsize': figure_size})

    # first figure: histograms drawn by pandas
    sales_data.hist()
    plt.show()

    # second figure: number of rows per route, coloured by result
    sns.countplot(data=sales_data, x="Route To Market", hue='Opportunity Result')
    plt.show()

def convert_numeric(sales_data):
    """Label-encode the categorical columns of *sales_data* in place.

    Each of the six listed columns is overwritten with the integer codes
    produced by a scikit-learn ``LabelEncoder``. The same encoder object is
    re-fitted for every column, so only the mapping of the last column
    remains stored on it afterwards.

    Args:
        sales_data: DataFrame containing all six categorical columns.

    Returns:
        The same DataFrame object that was passed in, now modified.
    """
    # scikit-learn is only needed here, so it is imported on demand
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()

    # columns whose values are replaced by integer labels
    categorical_columns = (
        'Supplies Subgroup',
        'Region',
        'Route To Market',
        'Opportunity Result',
        'Competitor Type',
        'Supplies Group',
    )
    for column in categorical_columns:
        sales_data[column] = encoder.fit_transform(sales_data[column])
    return sales_data

def prepare_data_for_test(sales_data):
    """Split *sales_data* into a feature frame and a target column.

    Args:
        sales_data: DataFrame containing an ``Opportunity Result`` column.

    Returns:
        tuple: ``(data, target)`` where ``data`` holds every column except
        ``Opportunity Number`` and ``Opportunity Result`` (original order
        kept) and ``target`` is the ``Opportunity Result`` column.
    """
    # the identifier and the label itself must not be used as features
    excluded = ('Opportunity Number', 'Opportunity Result')
    feature_names = []
    for name in sales_data.columns:
        if name in excluded:
            continue
        feature_names.append(name)

    features = sales_data[feature_names]

    # the Opportunity Result column is what the model has to predict
    labels = sales_data['Opportunity Result']
    return features, labels

def train_sets(data, target, test_size=0.20, random_state=10):
    """Split features and target into training and test subsets.

    Thin wrapper around scikit-learn's ``train_test_split``. The split
    proportion and the seed are parameters; their defaults reproduce the
    previously hard-coded values, so existing two-argument calls behave
    exactly as before.

    Args:
        data: feature rows.
        target: labels aligned with *data*.
        test_size: passed through to ``train_test_split``; defaults to 0.20.
        random_state: seed passed through to ``train_test_split``;
            defaults to 10.

    Returns:
        tuple: ``(data_train, data_test, target_train, target_test)``.
    """
    # scikit-learn is only needed here, so it is imported on demand
    from sklearn.model_selection import train_test_split

    data_train, data_test, target_train, target_test = train_test_split(
        data,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return data_train, data_test, target_train, target_test

# Load the raw Win-Loss data set (read from disk, downloaded if missing).
sales_data = get_datasets()

# Peek at the first few records of the imported data set.
# NOTE(review): the result of this bare expression is discarded here; it is
# only shown when it is the last expression of a notebook cell.
sales_data.head()

# Peek at the last few records of the dataframe (result likewise discarded).
sales_data.tail()

# Keep the per-column dtypes via the .dtypes attribute. This is captured
# BEFORE the label encoding below, so it describes the raw columns.
data_types = sales_data.dtypes

# Convert the non-numeric (categorical) columns into numeric labels.
sales_data = convert_numeric(sales_data)
sales_data.head()

# Separate the feature columns from the 'Opportunity Result' target.
data,target = prepare_data_for_test(sales_data)
data.head()

# Split features and target into training and test sets.
data_train, data_test, target_train, target_test = train_sets(data,target)



from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Create the classifier object; it considers the 3 nearest neighbours.
# NOTE(review): the features are not scaled anywhere in the visible code
# before being handed to this classifier - confirm that is intended.
neigh = KNeighborsClassifier(n_neighbors=3)

# Train the classifier on the training split.
neigh.fit(data_train, target_train)

# Predict the target for the held-out test rows.
pred = neigh.predict(data_test)

# Evaluate accuracy: compare the predictions against the true test labels.
print ("KNeighbors accuracy score : ",accuracy_score(target_test, pred))


KNeighbors accuracy score :  0.8167254085229093
