### Import required packages

In [1]:
import numpy as np
import pandas as pd
from os import path
import warnings
warnings.filterwarnings("ignore")

### Open the corresponding dataset based on the file encoding type and extension

In [2]:
#Open corresponding file
filename = "tweets"
file_format = filename+'.csv'
#From here on `filename` holds the full file name, extension included
filename = file_format
#Candidate encodings, tried in this order when reading a CSV file
encoding = ['utf8','latin1','iso-8859-1','cp1252']
flag = False
encoding_list = len(encoding)
if file_format.endswith('.html'):
    #read_html returns one dataframe per table found; stack them into a single frame
    dfs = pd.read_html(file_format)
    df  = pd.concat(dfs)
    df.to_csv(filename+".csv",index=False)
if file_format.endswith('.json'):
    df = pd.read_json(filename)
if 'parsed.csv' in filename:
    #Files whose name contains 'parsed.csv' are read with their first column as the index
    #and skip the encoding search below
    df = pd.read_csv(file_format, index_col=0 )
    flag = True
if not flag and file_format.endswith('.csv'):
    #Try all the possible encoding schemes to see which one applies to the dataset.
    #range(encoding_list) covers every entry (the last one used to be skipped).
    for encoding_type in range(encoding_list):
        try:
            df = pd.read_csv(file_format, index_col=None , encoding = encoding[encoding_type])
            break
        except UnicodeError:
            #Wrong encoding for this file: move on to the next candidate.
            #Any other problem (e.g. a missing file) is raised right here
            #instead of surfacing later as an undefined `df`.
            continue
    else:
        raise ValueError("Could not decode {} with any of the encodings {}".format(file_format, encoding))

### Display the first 5 rows of the dataset

In [3]:
# Preview the first five rows of the loaded dataframe.
df.head()

Unnamed: 0.1,Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude
0,1,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,0.0,,11/29/17 23:59,False,,9.36e+17,,"<a href=""http://instagram.com"" rel=""nofollow"">...",ahmediaTV,0,False,False,,
1,2,@johnconyersjr @alfranken why have you guys ...,False,0.0,johnconyersjr,11/29/17 23:59,False,,9.36e+17,266150000.0,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",JesusPrepper74,0,False,False,,
2,3,Watched Megan Kelly ask Joe Keery this A.M. if...,False,0.0,,11/29/17 23:59,True,,9.36e+17,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0,False,False,,
3,4,Women have been talking about this crap the en...,False,0.0,,11/29/17 23:59,False,,9.36e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",TheDawnStott,0,False,False,,
4,5,.@BetteMidler please speak to this sexual assa...,False,15.0,,11/29/17 23:59,False,,9.36e+17,,"<a href=""http://twitter.com/#!/download/ipad"" ...",scottygirl2014,11,False,False,,


### Data cleaning methods - Drop, rename and selecting subset of columns

In [14]:
class cleanData:
    """Interactive column-cleaning helpers that operate on a single DataFrame.

    ``cleanColumnData``, ``dropColumns`` and ``rename`` modify the wrapped
    frame in place, so every other reference to the same object (for example
    a notebook-level variable passed to the constructor) sees the changes.
    ``select_subsetOfColumns`` leaves the wrapped frame untouched and returns
    a new frame.
    """

    #Initialize the dataset that is used to apply the relevant cleaning operations
    def __init__(self, df):
        #Keep a reference (not a copy) so in-place edits are visible to the caller
        self.df = df

    def _printColumns(self):
        """Print every column of the wrapped frame preceded by its positional index."""
        for index, column in enumerate(self.df.columns):
            print(index, column)

    @staticmethod
    def _parseColumnIndex(ch, columnCount):
        """Return ``ch`` as a valid column position, or None (after printing a hint)."""
        if ch.isdecimal() and int(ch) < columnCount:
            return int(ch)
        print("Invalid column index '{}'. Enter a number between 0 and {}".format(ch, columnCount - 1))
        return None

    def cleanColumnData(self):
        """Normalize the column names and blank out missing text values.

        Column names are stripped, lower-cased, spaces become underscores and
        parentheses are removed. Missing values in object-dtype columns are
        replaced with an empty string.

        Returns:
            list: a one-element list holding the dtypes Series of the frame
            as it was before any cleaning.
        """
        data_types = [self.df.dtypes]
        #convert all column names into lowercase for uniformity and remove whitespaces;
        #regex=False makes '(' and ')' literal characters on every pandas version
        self.df.columns = (
            self.df.columns.str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace('(', '', regex=False)
            .str.replace(')', '', regex=False)
        )
        #replace the cells without any data in the text (object) columns with an empty string.
        #Assigning back is required: replacing on a .loc selection only edits a temporary copy.
        object_columns = self.df.columns[(self.df.dtypes == object).to_numpy()]
        if len(object_columns) > 0:
            self.df[object_columns] = self.df[object_columns].fillna('')
        return data_types

    def dropColumns(self):
        """Interactively drop columns from the wrapped frame, in place.

        The columns are listed once with their index; the indices keep
        referring to that listing for the whole session, even after some
        columns have been dropped. An empty answer ends the session.
        Invalid or repeated indices are reported and skipped.
        """
        dropCount = 0
        print("The index values and the column names are listed below:")
        #Lists all the columns in a dataset along with the index
        self._printColumns()
        print("\n")
        #Snapshot of the columns so the printed indices stay valid while dropping
        colToDrop = list(self.df.columns)
        droppedPositions = set()
        while True:
            ch = input("Enter the column index you want to drop or Press enter to exit\n").strip()
            if ch == '':
                #Drop count is set to print the corresponding print statements.
                if dropCount > 0:
                    print("Done dropping the columns")
                else:
                    print("Exiting the function")
                break
            position = self._parseColumnIndex(ch, len(colToDrop))
            if position is None:
                continue
            if position in droppedPositions:
                print("Column {} was already dropped".format(colToDrop[position]))
                continue
            self.df.drop(columns=colToDrop[position], inplace=True)
            droppedPositions.add(position)
            dropCount += 1

    def rename(self):
        """Interactively rename columns of the wrapped frame, in place.

        The user picks a column by index and types its new name; an empty
        index ends the session and all collected renames are applied at once.
        Invalid indices and blank new names are reported and skipped.
        """
        renameCount = 0
        #Maps each chosen current column name to the new name typed by the user
        RenamedColumns = {}
        print("The index values and the column names are listed below:")
        #Lists all the columns in a dataset along with the index
        self._printColumns()
        print("\n")
        colToPlot = list(self.df.columns)
        while True:
            ch = input("Enter the column index you want to rename or Press enter to exit\n").strip()
            if ch == '':
                if renameCount > 0:
                    print("Done renaming")
                else:
                    print("Exiting the function")
                break
            position = self._parseColumnIndex(ch, len(colToPlot))
            if position is None:
                continue
            new_name = input("Enter the new name for the column {}:\n".format(colToPlot[position]))
            if new_name.strip() == '':
                print("The new column name cannot be empty")
                continue
            #Key the rename on the column the user selected, not on list order
            RenamedColumns[colToPlot[position]] = new_name
            renameCount += 1
        if RenamedColumns:
            self.df.rename(columns=RenamedColumns, inplace=True)

    def select_subsetOfColumns(self):
        """Ask for a set of column indices and return those columns as a new frame.

        Returns:
            pandas.DataFrame: a copy containing the selected columns, in the
            order the indices were entered.

        Raises:
            ValueError: if an answer is not an integer.
            IndexError: if an index is outside the range of columns.
        """
        #Lists all the columns in a dataset along with the index
        self._printColumns()
        NumberOfIndex = int(input("Enter the total number of column indices you want to slice \n"))
        ColumnIndexToBeModified = []
        for _ in range(NumberOfIndex):
            #Append the column index to be sliced in a new list
            ColumnIndexToBeModified.append(int(input("Enter the index \n")))
        #Positional selection keeps exactly the chosen columns even if names repeat
        modifiedData = self.df.iloc[:, ColumnIndexToBeModified].copy()
        return modifiedData

### Creating an instance for the data clean class

In [15]:
#Creating an instance for the data clean class; the object keeps a reference to `df` (the constructor does not copy it)
cleanData_object = cleanData(df)

### Invoke the relevant data cleaning methods

In [16]:
def invokeCleaningMethod():
    """Ask whether to rename and/or drop columns and run the chosen operations.

    Answers are matched case-insensitively; an empty answer counts as "yes".
    The operations are performed through the notebook-level ``cleanData_object``.

    Returns:
        int: how many of the two questions were answered "no" (0, 1 or 2).
        A value of 2 means the user declined both operations.
    """
    choiceCount = 0
    #str.lower() returns a new string, so the normalized answer must be kept
    choice = input("Do you want to rename column names? Type Yes or no\n").strip().lower()
    if choice in ('yes', 'y', ''):
        cleanData_object.rename()
    elif choice in ('no', 'n'):
        choiceCount += 1
    else:
        print("Enter a valid choice")
    choice = input("Do you want to drop any columns? Type Yes or no\n").strip().lower()
    if choice in ('yes', 'y', ''):
        cleanData_object.dropColumns()
    elif choice in ('no', 'n'):
        choiceCount += 1
    else:
        print("Enter a valid choice")
    #Always return the count so callers get an int on every path
    return choiceCount

In [17]:
#Invoke relevant methods for data preprocessing based on user input
try:
    #Invoke cleanData method by default
    cleanData_object.cleanColumnData()
    #Keep the normalized answer: str.lower() does not modify the string in place
    choice = input("Do you want to perform drop or rename operations on any columns? Type Yes or no\n").strip().lower()
    if choice in ('yes', 'y', ''):
        count = invokeCleaningMethod()
        #A count of 2 means both operations were declined, so there is nothing more to offer
        if count != 2:
            choice = input("Do you want to perform any additional drop, rename or subset operations? Type Yes or no\n").strip().lower()
            if choice in ('yes', 'y', ''):
                invokeCleaningMethod()
            elif choice not in ('no', 'n'):
                print("Enter a valid choice")
    elif choice not in ('no', 'n'):
        print("Enter a valid choice")

except ValueError as e:
    print("Value error {} occured".format(e))

Do you want to perform drop or rename operations on any columns? Type Yes or no
n


### Display the dataframe after rename and drop operations

In [11]:
# Display the dataframe as this cell's output.
df

Unnamed: 0,unnamed:_0,text,favorited,favoritecount,replytosn,created,truncated,replytosid,id,replytouid,statussource,screenname,retweetcount,isretweet,retweeted,longitude,latitude
0,1,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,0.0,,11/29/17 23:59,False,,9.360000e+17,,"<a href=""http://instagram.com"" rel=""nofollow"">...",ahmediaTV,0,False,False,,
1,2,@johnconyersjr @alfranken why have you guys ...,False,0.0,johnconyersjr,11/29/17 23:59,False,,9.360000e+17,2.6615e+08,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",JesusPrepper74,0,False,False,,
2,3,Watched Megan Kelly ask Joe Keery this A.M. if...,False,0.0,,11/29/17 23:59,True,,9.360000e+17,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0,False,False,,
3,4,Women have been talking about this crap the en...,False,0.0,,11/29/17 23:59,False,,9.360000e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",TheDawnStott,0,False,False,,
4,5,.@BetteMidler please speak to this sexual assa...,False,15.0,,11/29/17 23:59,False,,9.360000e+17,,"<a href=""http://twitter.com/#!/download/ipad"" ...",scottygirl2014,11,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393130,393131,RT @Suffragentleman: You can only choose one.....,FALSE,0.0,,12/25/17 0:00,False,,9.450820e+17,,"<a href=""http://twitter.com/download/android"" ...",boaomega22,616,True,False,,
393131,393132,"#MeToo, say victims of sexual harassment in Ja...",FALSE,0.0,,12/25/17 0:00,False,,9.450820e+17,,"<a href=""http://bufferapp.com"" rel=""nofollow"">...",April_Magazine,0,False,False,,
393132,393133,Susan Collins tries to #MeToo her way out of h...,FALSE,0.0,,12/25/17 0:00,False,,9.450820e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Noofer55,0,False,False,,
393133,393134,RT @OneMillionVjj: Punish those who choose not...,FALSE,0.0,,12/25/17 0:00,False,,9.450820e+17,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",ZBezzt,5,True,False,,


### Selecting subsets of columns

In [19]:
#Keep the normalized answer: str.lower() returns a new string instead of changing `choice`
choice = input("Do you want get a subset of columns? Type Yes or no\n").strip().lower()
if choice in ('yes', 'y', ''):
    #Invoke subset column method based on user choice and store the sliced column data in a new dataframe df1 to avoid conflicts
    df1 = cleanData_object.select_subsetOfColumns()
elif choice not in ('no', 'n'):
    print("Enter a valid choice")

Do you want get a subset of columns? Type Yes or no
y
0 unnamed:_0
1 text
2 favorited
3 favoritecount
4 replytosn
5 created
6 truncated
7 replytosid
8 id
9 replytouid
10 statussource
11 screenname
12 retweetcount
13 isretweet
14 retweeted
15 longitude
16 latitude
Enter the number of column index you want to slice 
2
Enter the index 
4
Enter the index 
11


### Placing the cleaned dataset in a new file

In [20]:
#Saving the cleaned dataset in a new CSV file based on user choice
#Keep the normalized answer: str.lower() returns a new string instead of changing `choice`
choice = input("Do you save the above changes to a new csv file? Type Yes or no\n").strip().lower()
if choice in ('yes', 'y', ''):
    #Stores the cleaned/ sliced dataset with the suffix '_cleaned'/ '_subsetData' in the current working directory
    #NOTE(review): `filename` is used as-is; if it still carries its extension the result is e.g. "name.csv_cleaned.csv" - confirm this is intended
    df.to_csv(filename+"_cleaned.csv",index=False)
    try:
        df1.to_csv(filename+"_subsetData.csv",index=False)
    except NameError:
        #df1 only exists when a column subset was selected earlier; real write errors are no longer hidden
        print("No subset of columns was selected, skipping the subset file")
elif choice not in ('no', 'n'):
    print("Enter a valid choice")


Do you save the above changes to a new csv file? Type Yes or no
y
