<a href="https://colab.research.google.com/github/leosandler/ML_ISU_Project_LeoSandler/blob/master/Linear_ML_Companion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# IMPORTS FOR GOOGLE DRIVE INTEGRATION
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Sign the notebook user in through Colab's auth helper. Presumably this opens Google's consent prompt the first time - confirm against the
# google.colab documentation.
auth.authenticate_user()
# Build a PyDrive auth object and hand it the application-default credentials instead of running PyDrive's own login flow.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Module-level Drive client created from those credentials. NOTE(review): nothing in the visible code uses `drive` directly; it is presumably
# consumed by the upload helper defined elsewhere in the notebook - verify.
drive = GoogleDrive(gauth)

# IMPORTS CENTRAL TO THE CODE
from __future__ import print_function
import math  # Simple math based operations are done through this import. For example, I use the square root function from this import.
from sklearn.impute import SimpleImputer  # Senses for empty sections within the data, used in data scrubbing.
from sklearn.model_selection import train_test_split  # This is used in data splitting, in division on the training and test sets.
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt  # Matplotlib allows for data to be graphed within python.
from matplotlib import patches
import numpy as np  # Allows for more complex mathematics based operations.
from sklearn import metrics
import tensorflow as tf  # The main import which allows for linear machine learning to be performed.
from tensorflow.python.data import Dataset
import pandas as pd  # Pandas is an import which allows for easy manipulation of data sets.
tf.logging.set_verbosity(tf.logging.ERROR)  # TensorFlow 1.x logging threshold: only messages of severity ERROR or higher are emitted.
pd.options.display.float_format = '{:.1f}'.format  # Pandas display option: floats are rendered with one decimal place when frames are displayed.


'''
The DATA Class.

The data class is initialized through an inputted data set name and url. Upon initialization, two attributes are set by reading the url as a csv, and then 
extracting the headers or data categories. The self.model boolean value shows if the model class has been instantiated. The self.user_name value holds
None, and is switched to mirror the user's name given in the User class. The self.explained boolean value is False until it is turned true after the user
views an explanation of the data categories. This class holds multiple methods, which will be explained individually.
'''
class Data:
  """A named CSV data set held as a pandas DataFrame, plus console helpers for exploring it.

  Attributes:
    name: Display name of the data set, used in printed messages and plot titles.
    url: Location the CSV was read from.
    pd_data: The data set as a pandas DataFrame.
    headers: Column names of pd_data, in file order.
    model: Starts False; User.menu sets it to True once a Model has been built for this data set.
    user_name: Starts as None; User.menu stores the user's name in this attribute on the Model instance.
    explained: Starts False; User.menu sets it to True after the category explanation has been shown.
  """

  def __init__(self, name, url):
    """Read the CSV at `url` and record its column names.

    Args:
      name: Display name for the data set.
      url: Path or URL of a comma-separated file that pandas can read.
    """
    self.name = name
    self.url = url
    self.pd_data = pd.read_csv(url, sep=",")
    self.headers = list(self.pd_data.columns.values)
    self.model = False
    self.user_name = None
    self.explained = False

  def data_categories(self):
    """Print every column name as a numbered, human-readable list.

    The names are read from the DataFrame rather than from self.headers, so the stored header list is never modified. Numbering starts at 1 and
    matches the numbers accepted by histogram().
    """
    for number, column in enumerate(self.pd_data.columns.values, start=1):
      # Underscores become spaces and each word is capitalized purely for display.
      print(str(number) + ") " + column.replace("_", " ").title())

  @staticmethod  # Static because it does NOT use any instance or class state: the text is specific to the California housing data set.
  def category_explaining():
    """Print a fixed explanation of the nine California housing columns."""
    print("1) Longitude: Classifying homes by how far in the west they are."
          "\n2) Latitude:  Classifying homes by how far north they are."
          "\n3) Housing Median Age: The median age of a home within that house's block."
          "\n4) Total Rooms: The amount of rooms within a block."
          "\n5) Total Bedrooms: The amount of bedrooms in a block."
          "\n6) Population: The amount of people living in the block of homes."
          "\n7) Households: The amount of homes within a block."
          "\n8) Median Income: The block's median income, in tens of thousands USD."
          "\n9) Median House Value: Median value for households within a block, in USD.")

  def data_printing(self):
    """Offer to print the whole data set or, failing that, a preview of its first rows.

    Answers are compared after .title(), so "y" and "Y" both count as yes; anything else counts as no.
    """
    yes = input("Note: Machine learning data sets are very large. This will take up a large amount of space in the console.\nWould you still like to print"
                " the entire data set?\n(Y/N): ").title()
    if yes == "Y":
      print("Printing the entire " + self.name + " dataset.\n\n")
      print(self.pd_data.to_string())  # to_string() renders every row instead of pandas' truncated view.
      return
    # The preview is only offered when the full print was declined.
    small_print = input("Do you want to print a smaller version of the data set?\n(Y/N): ").title()
    if small_print == "Y":
      print("Printing the beginning of the " + self.name + " dataset.\n\n")
      print(self.pd_data.head())

  def histogram(self):
    """Ask for a column, show its histogram, then offer to save it and/or show it again.

    Invalid entries are re-prompted in a loop. The previous recursive retry fell through after the nested call returned and plotted with an
    invalid or unassigned category number; the loop guarantees plotting only happens once, with a validated choice.
    """
    print("This takes one section of your data and turns it into a histogram. A histogram is useful for visualizing the distribution of data.")
    while True:
      self.data_categories()  # Re-list the options on every attempt so the numbering is always on screen.
      try:
        user_cat = int(input("\nYour Input(As a number): "))
      except ValueError:
        print("Please enter a number, not a string.")
        continue
      if 1 <= user_cat <= len(self.headers):  # Same 1-based numbering that data_categories() prints.
        print("You selected a histogram for " + self.headers[user_cat - 1].replace("_", " ") + ".")
        break
      print("Please enter a valid number.")
    self.histogram_printing(user_cat, printing=True, saving=False)
    saving = input("(Y/N) Do you want to download this histogram: ").title()
    reprint = input("(Y/N) Do you want to have the histogram printed again: ").title()
    # histogram_printing() rebuilds the plot on every call, so showing again and saving are separate calls.
    if reprint == "Y":
      self.histogram_printing(user_cat, printing=True, saving=False)
    if saving == "Y":
      self.histogram_printing(user_cat, printing=False, saving=True)

  def histogram_printing(self, user_cat, printing, saving):
    """Build the histogram for one column with matplotlib, then optionally save and/or show it.

    Args:
      user_cat: 1-based column number, as printed by data_categories().
      printing: When true, display the plot with plt.show().
      saving: When true, ask for a file name, write "<name>.png" and pass it to upload(), which is defined elsewhere in this file.
    """
    column = self.headers[user_cat - 1]
    plt.subplot()
    self.pd_data[column].hist()
    # White text and ticks are chosen to stay readable against the dark background colour used when saving.
    plt.suptitle(self.name.title() + " Histogram", fontsize=20, color="white")
    plt.tick_params(axis="x", colors="white")
    plt.tick_params(axis="y", colors="white")
    category = column.replace("_", " ").title()
    plt.xlabel(category, fontsize=18, color="white")
    plt.ylabel("Apearances in Data Set", fontsize=18, color="white")
    if saving:
      # Saving happens before show() on purpose; the figure is built fresh in this call for that reason.
      filename = input("Enter in the file name: ")
      plt.savefig(filename + ".png", transparent=False, facecolor="#383838")
      upload(filename=filename, filetype="png")
    if printing:
      plt.show()

      
'''
The USER class.

This is instantiated with the user's name. As well, the csv_importing function is called upon instantiation. This class is used to store class instances of
the Data and Model classes, along with the user's name.
'''
class User:
  """Holds the user's name together with the Data and Model instances that the menu drives.

  Attributes:
    username: Name shown in the greeting and copied onto the model.
    data_dict: Maps a data set name to its Data instance; filled by csv_importing().
    model_dict: Maps a data set name to its Model instance; filled lazily by menu().
  """

  def __init__(self, username):
    """Create the dictionaries, remember the user's name and load the California housing data.

    Args:
      username: Name of the person using the program.
    """
    self.username = username
    self.data_dict = {}
    self.model_dict = {}
    self.csv_importing()  # Loads the data set into data_dict["California Housing"].

  def menu(self):
    """Run the interactive hub until the user picks "Exit".

    The greeting is printed once and the Model is created on first use. Each pass of the loop offers the category explanation, scrubbing and
    splitting while they are still outstanding, and then dispatches one numbered selection.

    Invalid selections simply start the next pass of the loop. The earlier version called menu() again from inside its own loop, which reprinted
    the greeting and added a stack frame per bad entry. The ValueError handler now covers only the number parsing, so a ValueError raised inside
    a menu action is no longer misreported as bad menu input.

    Raises:
      KeyboardInterrupt: Raised deliberately for option 6, as the way of stopping execution in the notebook.
    """
    key = "California Housing"
    print("Hello " + str(self.username) + " Welcome to Leo's Machine Learning Companion.\nIt includes multiple features to help with linear modelling."
          "\nNote: plots, etc. are saved to the recent section of Google Drive.")
    print("The California data allows for you to model California housing price data from 1990 US Census.\nData is based on many blocks.")
    data = self.data_dict[key]
    if not data.model:
      # Build the Model once from the Data instance's name and url, then flag the data so it is not rebuilt.
      self.model_dict[key] = Model(data.name, data.url, "California")
      data.model = True
    model = self.model_dict[key]
    model.user_name = self.username
    while True:
      if not data.explained:
        cat_explanation = input("(Y/N) Do you want an explanation of the data set categories: ").title()
        if cat_explanation == "Y":
          data.category_explaining()
          data.explained = True
      if not model.scrubbed:
        print("You haven't scrubbed your data set yet. Data scrubbing randomizes data and prevents duplicates.")
        scrub = input("(Y/N) Do you want to scrub your data before proceeding: ").title()
        if scrub == "Y":
          model.data_scrubbing()
        else:
          print("Continuing without scrubbing will compromise your model. It is highly recommended to select the 'Data Scrubbing' option below.")
      if not model.split:
        print("You haven't split your data set yet. Data splitting prevents overfit models.\nThis occurs when a model caters too much to one specific test.\n"
              "For example, it would be like studying for a test with the exact same set of multiple choice questions.\nThis would be detrimental if a variety"
              "of knowledge is required.(In this case, it is as a model must fit the data in a broad sense)")
        split = input("(Y/N) Do you want to split your data before proceeding: ").title()
        if split == "Y":
          model.data_splitting()
        else:
          print("Continuing without splitting will compromise your data. It is highly recommended to select the 'Data Splitting' option below.")
      try:
        user_selection = int(input("SELECTION MENU:\n1) Data Set Reading\n2) Data Histogram\n3) Data Scrubbing\n4) Data Splitting\n5) Machine Learning Model "
                                   "Training\n6) Exit\nYour Input(As a number): "))
      except ValueError:
        print("Please enter an integer, not a string.")
        continue
      if user_selection == 1:
        data.data_printing()
      elif user_selection == 2:
        data.histogram()
      elif user_selection == 3:
        if model.scrubbed:
          print("The data is already scrubbed.")  # Scrubbing twice would rescale the label a second time.
        else:
          model.data_scrubbing()
      elif user_selection == 4:
        if model.split:
          print("The data is already split.")
        else:
          model.data_splitting()
      elif user_selection == 5:
        model.train_model()
      elif user_selection == 6:
        # The message is attached so the traceback does not look like an unexpected failure.
        raise KeyboardInterrupt("Thanks for using Leo's Linear Machine Learning Companion.")
      # Any other number falls through and the loop shows the menu again.

  def csv_importing(self):
    """Create the California housing Data instance and store it in data_dict under its own name."""
    class_instance = Data("California Housing", "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv")
    self.data_dict[class_instance.name] = class_instance

    
'''
The MODEL class.

The model class is initialized based upon input from the Data class, using inheritance. Many boolean values are included to store data and also to verify 
whether or not actions have been taken. These will be explained in the class initialization.
'''
class Model(Data):
  def __init__(self, name, url, model_name):
    """Load the data set through Data.__init__ and start with every modelling flag cleared.

    Args:
      name: Display name of the data set, forwarded to Data.
      url: CSV location, forwarded to Data (which reads it immediately).
      model_name: Extra label that distinguishes this model from the plain data set.
    """
    super().__init__(name, url)
    self.model_name = model_name
    # Progress flags. data_scrubbing() sets `scrubbed` and data_splitting() sets `split`; `trained` is read by train_model().
    # NOTE(review): no code visible here ever sets `tested` or `trained` to True - confirm against the rest of the class.
    self.scrubbed = self.split = False
    self.tested = self.trained = False
    # Filled in by data_splitting(): a (testing %, training %) tuple and the two resulting frames.
    self.train_test_ratio = None
    self.training_data = self.testing_data = None
    
  '''
  The data_scrubbing function.
  
  First, this checks if data was scrubbed. Next, the data is randomized and the label, median house value, is scaled which makes the numbers easier to work
  with.
  '''
  def data_scrubbing(self):
    """Shuffle the rows and rescale the label column, at most once per instance.

    On the first call the rows of self.pd_data are put into a random order and "median_house_value" is divided by 1000; self.scrubbed is then
    set so later calls only print a notice. Running it twice would rescale the label a second time, hence the guard.

    The shuffled frame is now assigned back to self.pd_data. DataFrame.reindex returns a new object, and the previous code discarded it, so
    the data was never actually randomized even though the message said so.
    """
    if self.scrubbed:
      print("The data was already scrubbed.")
      return
    # NOTE: an earlier draft also filled empty cells with column means via SimpleImputer. It indexed the frame as self.pd_data[:, :-1], which
    # is not valid DataFrame indexing (positional slicing needs .iloc), and this data set has no empty cells, so the step was left out.
    self.pd_data = self.pd_data.reindex(np.random.permutation(self.pd_data.index))
    self.pd_data["median_house_value"] /= 1000.0  # Scale the label to thousands of USD.
    print("\nThe data was scrubbed. The median house value was divided by 1000 for scaling. Data was also randomized.")
    self.scrubbed = True
  
  '''
  The feature_selection function. 
  
  This uses the data_categories() function to show the user their choices from the headers of this pandas data frame. Finally, it returns the user's choice.
  '''
  def feature_selection(self):
    """Ask the user to pick a feature column by number and return its header name.

    The category list is printed once with data_categories(). Only numbers from 1 up to one less than the number of headers are accepted,
    which leaves out the final column (the label). Non-numeric and out-of-range entries are re-prompted.

    Returns:
      The entry of self.headers that matches the chosen number.
    """
    print("\nA feature is a variable that describes data. For example, in a spam email reader, a feature could be the amount of words in the subject line.\n"
          "Below are the possible categories, barring median house value.")
    self.data_categories()
    while True:
      answer = input("\nYour Input(As a number): ")
      try:
        number = int(answer)
      except ValueError:
        print("Please enter a number, not a string.")
        continue
      highest_allowed = len(self.headers) - 1  # The last header is the label, so it cannot be chosen.
      if number < 1 or number > highest_allowed:
        print("Please enter a valid number in order to pick a feature. Note: You cannot select house price as this is the label.")
        continue
      chosen = self.headers[number - 1]
      print("You selected the feature " + chosen.replace("_", " ") + ".")
      return chosen
      
  '''
  The data_splitting function.
  
  This function determines the percentage of testing data. Using this percentage, and a the sckit train_test_split module, the data is split into these two
  sets.
  '''
  def data_splitting(self):
    """Ask for a testing percentage and split self.pd_data into training and testing frames.

    Does nothing except print a notice when the data is already split. Otherwise the user is prompted until an acceptable percentage is
    entered, then train_test_split() fills self.training_data and self.testing_data, self.split is set, and self.train_test_ratio records
    (testing %, training %).

    Changes from the previous implementation:
      * Retries use a loop. The recursive retries fell through after returning and split the data again with the rejected (or never
        assigned) percentage.
      * The hard limits are checked before the 15-25 recommendation. They used to sit in elif branches that could never run, so a value
        such as 150 could be confirmed and passed on to train_test_split().
      * The entry is parsed as a float and actually rounded to one decimal place, as the prompt promises.
    """
    if self.split:
      print("The data was already split.")
      return
    print("\nA good data splitting ratio is important and further decreases the chances of overfitting.\nA strong percentage of testing data is from"
          " 15-25 percent.")
    print("Note: This percentage will be rounded to 1 decimal place.")
    while True:
      try:
        testing_percentage = round(float(input("What percentage of the data should be used for testing: %")), 1)
      except ValueError:
        print("Please enter a number.")
        continue
      if testing_percentage >= 100:
        print("99 is the absolute maximum for test data division.\nNote: This will warrant a weak model. However, this limit was exceeded in the selection."
              "process will restart.")
        continue
      if not testing_percentage > 0.9:  # Written this way so that NaN is rejected as well.
        print("1 is the absolute minimum for test data division.\nNote: This will warrant a weak model. However, the selection is less than this limit."
              "The selecting process will restart.")
        continue
      if not 15 <= testing_percentage <= 25:
        # Usable but outside the recommended range: the user has to confirm explicitly.
        outside_range = input("You have entered a number outside of the suggested percentage range, from 15-25.\nWould you like to continue: (Y/N)").title()
        if outside_range != "Y":
          print("Restarting this function, in order to select a different percentage for testing.")
          continue
      break
    # train_test_split takes the testing share as a fraction and returns the training frame first, then the testing frame.
    self.training_data, self.testing_data = train_test_split(self.pd_data, test_size=testing_percentage / 100)
    self.split = True
    self.train_test_ratio = (testing_percentage, round(100 - testing_percentage, 1))  # (testing %, training %)
    
  '''
  The train_model function.
  
  This function is the core of this code. It includes a TensorFlow linear regressor and integration of matplolib to represent the data. Parameters are 
  entered in as None(barring batch_size which is restricted to make sure that the gradient descent is stochastic, using one batch). Most of these 
  hyperparameters dictate the results of the regressor. However, some like the saving boolean determine whether the plots are being saved or not.
  '''
  def train_model(self, learning_rate = None, steps = None, batch_size = 1, periods = None, training_or_test = None, saving = None, my_feature = None):
    if learning_rate == None:  # If the parameter is not included in the function call, then the value must be set.
      print("Learning rate determines the magnitude of a gradient step. It must be a positive float over 0.")
      try:
        learning_rate = float(input("Select the learning rate: "))  # Float allows for decimal values to be included, rather than an integer.
        if learning_rate <= 0:
          self.train_model()  # Recalling the function if the learning rate is too low.
      except ValueError:  # Preventing a string input.
        self.train_model()
    if steps == None:  # Similar input process for data set steps.
      print("Steps are the amount of times the model changes, through input based upon other parameters.")
      try:
        steps = float(input("Select the number of steps: "))
        if not steps > 1:
          self.train_model(learning_rate = learning_rate)
      except ValueError:
        self.train_model(learning_rate = learning_rate)
    print("Batch size is the amount of data which is used in gradient descent.\Stochastic gradient descent uses one batch.\nMini-batch sizes range is 10-1K.")
    print("For this modelling program, batch size will be 1 in order to meet the space limit of 2GB for the TensorFlow linear regressor.")  # Notifying the user
    # of the selected batch size and reasoning.
    if periods == None:  # Similar process for filling in periods.
      print("Periods are the amount of times that a step/s(based on the value specified above) is/are taken.")
      try:
        periods = int(input("Select the number of periods: "))
        if not periods > 1:
          self.train_model(learning_rate = learning_rate, steps = steps, batch_size = batch_size)  # This includes a function call with the above parameters
          # filled in. In this case, the periods parameter was entered correctly. Calling the function in this manner with the integration of None values
          # prevents the user from needing to enter in other parameters twice.
      except ValueError:
        self.train_model(learning_rate = learning_rate, steps = steps, batch_size = batch_size)
    if self.split == False:  # Checking again to see if the user wants to split their data.
      print("You still have the option to split your data.")
      split = input("(Y/N) Do you want to split this data: ").title()
      if split == "Y":
        self.data_splitting()  # If so, their data is split here.
    if self.scrubbed == False:  # Same thing with data scrubbing.
      print("You still have the option to scrub your data.")
      scrub = input("(Y/N) Do you want to scrub this data: ").title()
      if scrub == "Y":
        self.data_scrubbing()
    print("In terms of data, you have the option of modelling training or test data.")
    if self.split == True:  # If the data is split, then the user decides between training or test data.
      while True:
        if self.trained == False:  # Checking if the training data is used yet.
          print("You have not trained the data yet. It is recommended you do this first.")  # Notifying the user.
        if training_or_test == None:  # If there is no input for this parameter, the input is given below.
          training_or_test = input("(Training/Test) Do you want to use training data for the linear modelling: ").lower()
        if training_or_test == "training" or training_or_test == "test":  # Otherwise, the information from the function call is verified and the loop ends.
            break  # Breaking the loop after correct user input.
    elif self.split == False:  # If the data is not split.
      training_or_test = "all data" # If the user has not split their data, then this is shown while modelling.
    if my_feature == None:
      my_feature = self.feature_selection()  # Calling the feature_selection() function for the user to pick a feature.
    my_label = "median_house_value"  # The label is selected. In this case, I restricted the label/target of the linear regressor to only median house value.
    if training_or_test.lower() == "training":
      targets = self.training_data[my_label]  # Setting the targets and feature data based upon class stored training data.
      my_feature_data = self.training_data[[my_feature]]
    elif training_or_test.lower() == "test":
      targets = self.testing_data[my_label]  # Setting the targets and feature data from the class based testing data.
      my_feature_data = self.testing_data[[my_feature]]
    elif training_or_test == "all data":
      print("You chose not to split your data. This is inadvisable, but the entire data set will now be modelled.")
      targets = self.pd_data[my_label]  # Setting targets and features based on all data.
      my_feature_data = self.pd_data[[my_feature]]
    feature_columns = [tf.feature_column.numeric_column(my_feature)]
    steps_per_period = steps / periods
    training_input_fn = lambda:self.my_input_fn(my_feature_data, targets, batch_size)  # Using lambda to enable the my_input_fn to be called.
    prediction_input_fn = lambda: self.my_input_fn(my_feature_data, targets, num_epochs=1, shuffle = False, batch_size = batch_size)  # Lambda is used again
    # in order to have a prediction function, in contrast with the training input function.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate) 
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(feature_columns = feature_columns, optimizer = my_optimizer)  # Creating the TensorFlow linear regressor
    # object, based upon the two above variables.
    plt.figure(figsize = (15, 6))  # Matplotlib code to initialize a plot with dimensions.
    plt.subplot(1, 2, 1)
    plt.title("Learned Line by Period", color = "white", fontsize = 20)  # Plot title.
    plt.tick_params(axis = "x", colors = "white")  # X and Y ticks are set to white colour for ease of reading within dark mode.
    plt.tick_params(axis = "y", colors = "white")
    plt.ylabel(my_label.replace("_", " ").title(), color = "white", fontsize = 18)  # Labels are set.
    plt.xlabel(my_feature.replace("_", " ").title(), color = "white", fontsize = 18)
    sample = self.pd_data.sample(n = 300)  # Data is sampled, in order to fill the scatter plot.
    plt.scatter(sample[my_feature], sample[my_label])  # A scatter plot is made with the features and labels as the two things in relation.
    colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]  # Setting the line colour warmest with the least RMSE.
    print("Training model...")
    print("RMSE (on " + training_or_test.lower() + " data):")
    root_mean_squared_errors = []  # Creating an empty list.
    for period in range (0, periods):  # For loop, reiterated for each period.
      linear_regressor.train(input_fn=training_input_fn, steps=steps_per_period)  # Using the linear regressor object to make predictions below.
      predictions = linear_regressor.predict(input_fn=prediction_input_fn)
      predictions = np.array([item['predictions'][0] for item in predictions])  # List comprehensions are used to turn the predictions into a numpy array.
      root_mean_squared_error = math.sqrt(metrics.mean_squared_error(predictions, targets))  # RMSE is calculated with the math import.
      print("  period %02d : %0.2f" % (period, root_mean_squared_error))  # Period number and RMSE is shown, using string formatting.
      root_mean_squared_errors.append(root_mean_squared_error)  # The individual RMSE is appended to the list.
      y_extents = np.array([0, sample[my_label].max()])
      weight = linear_regressor.get_variable_value('linear/linear_model/%s/weights' % my_feature)[0]
      bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
      x_extents = (y_extents - bias) / weight
      x_extents = np.maximum(np.minimum(x_extents, sample[my_feature].max()), sample[my_feature].min())
      y_extents = weight * x_extents + bias
      plt.plot(x_extents, y_extents, color=colors[period]) # The extents are mathematical inputs which plot the lines neatly. I do not have the expertise to 
      # explain why these are useful or why they plot the lines in this way.
    print("Model training finished.")
    plt.subplot(1, 2, 2)  # Initializing another plot.
    plt.ylabel("RMSE", color = "white", fontsize = 18)
    plt.xlabel("Periods", color = "white", fontsize = 18)
    plt.title("Root Mean Squared Error vs. Periods", color = "white", fontsize = 20)
    plt.tick_params(axis = "x", colors = "white")
    plt.tick_params(axis = "y", colors = "white")
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)  # Plotting the RMSE list over time.
    calibration_data = pd.DataFrame()
    calibration_data["predictions"] = pd.Series(predictions)
    calibration_data["targets"] = pd.Series(targets)  # These three lines are setting the table with various statistics relating prediction data to target data.
    print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)
    if saving == True:
      filename = input("Enter in the file name for the plots: ")
      plt.savefig(filename + ".png", transparent = False, facecolor = "#383838")  # Saving the plots.
      upload(filename = filename, filetype = "png")  # Using the uploading function.
      print("The charts will be shown again.")
    display.display(calibration_data.describe())
    plt.show()  # For some reason, the matplotlib plots cannot be saved after they are shown. Therefore, the user will need to see the same plot twice
    # in order to save it to their google drive.
    if saving == None:  # Again, using the parameter to track user inputs.
      statistics = input("(Y/N) Do you want to save your statistics from this data as a TXT file: ").title()
      if statistics == "Y":
        self.statistics_saving(my_feature, my_label, learning_rate, steps, batch_size, periods, root_mean_squared_error, root_mean_squared_errors)  # Calling
        # the statistics_saving function, ensuring to include all parameters so the data can be turned into a TXT file.
      saving = input("(Y/N) Do you want to save these charts: ").title()
      if saving == "Y":
        print("The function must be called again in order to save the data.")
        self.train_model(learning_rate = learning_rate, steps = steps, batch_size = 1, periods = periods, training_or_test = training_or_test, saving = True
                        , my_feature = my_feature)  # Calling the function again, with the same parameters and the saving boolean set to True, in order to 
        # fit the above condition(if saving == True) on the next function call.
    if training_or_test == "training":
      self.trained = True  # This is the end of the function, so at this point the data is trained if the user had selected training data.
    elif training_or_test == "test":
      self.tested = True  # The data is tested at this point, if that was the user's selection.
    if self.trained == True and self.tested == False:  # If the data is trained, but not tested. I wanted to include a way for the exact same settings to be 
      # used seamlessly with test data. This takes the training parameters and calls the same function.
      test = input("(Y/N) Do you want to model your testing data with the same settings: ").title()
      if test == "Y":
        self.train_model(learning_rate = learning_rate, steps = steps, batch_size = 1, periods = periods, training_or_test = "test", saving = True, my_feature
                         = my_feature)  # Calling the function with test data now, as specified in the parameters.
  
  '''
  The my_input_fn function.
  
  This function builds a TensorFlow Dataset by slicing the features and targets row by row, so that each element pairs one row of feature values with
  its target. The data set is batched, repeated and optionally shuffled, and the next batch of features and labels is returned by this function.
  '''
  @staticmethod  # No instance state is used, so this is a static method.
  def my_input_fn(features, targets, batch_size, shuffle=True, num_epochs=None):
    '''
    Build a TensorFlow input pipeline and return its next (features, labels) element.

    Parameters:
      features: mapping-like object (anything dict() accepts) of feature name -> values.
      targets: the label values paired with the features.
      batch_size: passed to Dataset.batch().
      shuffle: when truthy, the pipeline is shuffled with a buffer of 10000 elements.
      num_epochs: passed to Dataset.repeat(); the default None is forwarded unchanged.

    Returns:
      A (features, labels) pair taken from a one-shot iterator over the data set.
    '''
    # Every feature column is converted to a numpy array, keyed by its feature name.
    feature_arrays = {}
    for name, column in dict(features).items():
      feature_arrays[name] = np.array(column)

    # Pair the feature arrays with the targets, then batch and repeat the data set.
    dataset = Dataset.from_tensor_slices((feature_arrays, targets))
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)

    # NOTE: shuffling is applied after batching and repeating, so it reorders the already-formed batches.
    if shuffle:
      dataset = dataset.shuffle(buffer_size=10000)

    # This function is used by train_model, mostly through lambda.
    iterator = dataset.make_one_shot_iterator()
    next_features, next_labels = iterator.get_next()
    return next_features, next_labels
  
  '''
  The statistics_saving function.
  
  This function is the core of the companion section for linear regression. In my opinion, this is the most helpful function within the entire program.
  Many useful data points are specified and put into a TXT file, which is exported to google drive. These will be explained below. These do not need to be
  memorized now, as they can all be saved in Google Drive.
  '''
  def statistics_saving(self, my_feature, my_label, learning_rate, steps, batch_size, periods, root_mean_squared_error, root_mean_squared_errors):
    '''
    Write a plain-text summary of one training run to "<name>.txt" and upload it with the upload function.

    The file name (without extension) is asked from the user through input().

    Parameters:
      my_feature, my_label: column names; underscores become spaces and the result is title-cased in the report.
      learning_rate, steps, batch_size, periods: the train_model settings, written out as given.
      root_mean_squared_error: the final RMSE of the run.
      root_mean_squared_errors: sequence of RMSE values, one per period, listed in order starting at period 0.

    Reads from self: user_name, name, url, split and (only when split is True) train_test_ratio.
    '''
    fname = input("What should the file be called: ")

    # The whole report is assembled before the file is opened, so a failure while formatting a value cannot leave a half-written file behind.
    report_lines = [
      "User Name: " + str(self.user_name),
      "Data Set: " + str(self.name),
      "Data Set URL: " + str(self.url),  # Using the class data to specify the data set name and url.
      "Data Set Feature Used: " + str(my_feature.replace("_", " ").title()),
      "Data Set Label Used: " + str(my_label.replace("_", " ").title()),
      "Linear Regressor Learning Rate: " + str(learning_rate),
      "Linear Regressor Steps: " + str(steps),
      "Linear Regressor Batch Size: " + str(batch_size),
      "Linear Regressor Periods: " + str(periods),
    ]

    if self.split == True:  # Comparison kept exactly as before, so the condition behaves the same for any value stored in self.split.
      # NOTE(review): index 0 is labelled as testing and index 1 as training - confirm against where train_test_ratio is assigned.
      report_lines.append("Percentage of Testing Data Division: %" + str(self.train_test_ratio[0]))  # Accessing the tuple.
      report_lines.append("Percentage of Training Data Division: %" + str(self.train_test_ratio[1]))

    report_lines.append("RMSE by Period: ")  # Next, the RMSE section: one line per period with its RMSE.
    for period, period_rmse in enumerate(root_mean_squared_errors):
      report_lines.append("Period " + str(period) + " : " + str(period_rmse))
    report_lines.append("Final RMSE: " + str(root_mean_squared_error))

    # The with statement closes the file even if writing fails. Joining with "\n" gives the same text as the earlier separate writes.
    with open(fname + ".txt", "w") as working_file:
      working_file.write("\n".join(report_lines))

    upload(filename = fname, filetype = "txt")  # Using the upload function.


'''
The upload function.

This function is not within a class, but is extremely useful. It uses google drive to upload files into a user's recent files.
'''
def upload(filetype, filename):
  '''
  Upload the local file "<filename>.<filetype>" to Google Drive and print the ID of the uploaded file.

  Parameters:
    filetype: the file extension, without the dot (for example "png" or "txt").
    filename: the file name, without the extension.

  The module-level drive object (the authorized GoogleDrive client) is used for the upload.
  '''
  full_name = filename + "." + filetype  # The same name is used as the Drive title and as the local path to read from.
  drive_file = drive.CreateFile({"title": full_name})
  drive_file.SetContentFile(full_name)
  drive_file.Upload()
  print('Uploaded file with ID {}'.format(drive_file.get('id')))  # Notifying the user that their file is uploaded.

  
'''
The user_creation function.

This function is very simple. It welcomes the user, and then instantiates the User class after gathering their name. Next, this calls the menu to start the 
program.
'''
def user_creation():
  '''
  Greet the user, ask for their name, create a User with that name and open its menu.
  '''
  print("Welcome to Leo's Machine Learning Companion")  # Shown once, when the program starts.
  entered_name = input("Please enter your name: ")
  User(entered_name).menu()  # The new User instance is only needed to start its menu.

user_creation()  # Program entry point: runs as soon as this code is executed, greeting the user, creating the User object and opening its menu.