In [None]:
from nltk.corpus import stopwords
import pickle
import string
import re
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import scipy.sparse as sp
import matplotlib.pyplot as plt


def init(csv_path="gender-classifier-DFE-791531 2.csv"):
    """
    Load the gender-classifier CSV and keep only the rows with a usable target label.

    Rows in which every attribute is empty are dropped, then rows whose 'gender' value is
    missing or equal to 'unknown' are dropped. The names of the columns that contain at
    least one missing value and the number of 'male', 'female' and 'brand' profiles that
    remain are printed.
    :param csv_path: Path of the CSV file to read (latin1 encoded). Defaults to the file name
                     the script was originally written for.
    :return: The data set to be used for Predictive Analysis, as an independent DataFrame.
    """

    # Reading the data from the CSV file using the latin1 encoding.
    data_read = pd.read_csv(csv_path, encoding='latin1')

    # If all the attribute values are empty for any of the rows, we drop them.
    data = data_read.dropna(how='all')

    # Reporting the columns/attributes which contain at least one null value.
    columns_containing_missing_values = data.columns[data.isnull().any()].tolist()
    print("Column names which has missing values")
    print(columns_containing_missing_values)

    # 'gender' is the target variable, so rows without a value or with the placeholder
    # value 'unknown' cannot be used for training or testing.
    has_usable_label = data['gender'].notnull() & (data['gender'] != 'unknown')

    # .copy() detaches the result from the frame it was sliced from, so callers can add
    # columns to it without pandas raising SettingWithCopyWarning.
    data = data[has_usable_label].copy()

    for label in ('male', 'female', 'brand'):
        profile_count = len(data[data['gender'] == label])
        print(label.capitalize() + " Profile Count " + str(profile_count))

    return data


def clean_text(text):
    """
    Clean one piece of free text (a tweet or a profile description, i.e. the 'text' and
    'description' attributes).

    The steps, in order: convert to lower case; strip punctuation from the two ends of the
    whole string (not around every word); remove the literal substrings '&amp;' and 'https';
    remove every "non-word character followed by whitespace" pair and every
    "whitespace, comma, non-word character" triple; remove the listed special characters;
    remove runs of digits.

    Note that removing a "non-word character + whitespace" pair also removes the whitespace,
    so 'hello, world' becomes 'helloworld'.
    :param text: A string. None and float NaN (pandas' marker for a missing cell) are treated
                 as missing text. Any other non-string value is converted with str().
    :return: Cleaned string; the empty string for missing text.
    """
    # Without this guard str() would turn a missing cell into the literal word "nan"
    # (or "none"), which would then be counted as a real token by the vectorizer.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = text.strip(string.punctuation)
    text = text.replace("&amp;", '')
    text = text.replace("https", '')
    # Raw strings keep \W, \s and \d as regex escapes instead of (invalid) string escapes.
    text = re.sub(r'\W\s', '', text)
    text = re.sub(r'\s,\W', '', text)
    text = re.sub(r'[.!@#$%^&*()_,:;/-]', '', text)
    text = re.sub(r'\d+', '', text)

    return text


def check_confidence(data):
    """
    Keep only the observations whose gender label was assigned with full confidence.

    The 'gender:confidence' attribute holds the confidence the judges had while manually
    assigning a gender to a twitter profile. Only rows where it equals 1 (i.e. 100%) are kept.
    :param data: The data set to be filtered
    :return: Filtered data set.
    """
    is_fully_confident = data['gender:confidence'].eq(1)
    return data[is_fully_confident]


def split_dataset_for_one_attribute(gender_confident_data, attribute='text_cleaned'):
    """
    Build training and testing sets that use a single text attribute (for example the tweet
    or the profile description) as the only input for predicting the gender.
    :param gender_confident_data: The data which has 100% confident gender labels.
    :param attribute: The attribute using which classification model has to be trained and tested.
    :return: The training and testing data sets, ordered as
             (training inputs, training targets, testing inputs, testing targets).
    """
    documents = gender_confident_data[attribute]
    labels = gender_confident_data['gender']

    # Bag-of-words representation: one row per document, one column per token, cell = count.
    token_counts = CountVectorizer().fit_transform(documents)

    # Class labels mapped to the integers 0 .. Num_of_classes-1.
    encoded_labels = LabelEncoder().fit_transform(labels)

    # 66% of the rows go to the training set, the remainder to the test set.
    train_in, test_in, train_out, test_out = train_test_split(
        token_counts, encoded_labels, train_size=0.66)

    return train_in, train_out, test_in, test_out


def combine_tweet_and_description(data):
    """
    Combine the cleaned tweet and the cleaned profile description of every user into one
    text feature, and split the fully-confident rows into training and testing sets based
    on that feature.

    The two texts are joined with a single space so that the last word of the tweet and the
    first word of the description stay separate tokens. The input data set is not modified.
    :param data: The data set. Must contain the 'text_cleaned', 'description_cleaned',
                 'gender' and 'gender:confidence' attributes.
    :return: Training and Testing data sets, ordered as
             (training inputs, training targets, testing inputs, testing targets).
    """
    gender_confident_data = check_confidence(data)

    # Built as a local Series instead of a new column on `data`, so the caller's frame is
    # left untouched. Joining row by row after filtering gives the same text per row as
    # joining first and filtering afterwards.
    combined_text = gender_confident_data['text_cleaned'].str.cat(
        gender_confident_data['description_cleaned'], sep=' ')

    # Converting text strings into a matrix of word token counts.
    cv = CountVectorizer()
    inputString = cv.fit_transform(combined_text)

    # Encodes class labels from 0 to Num_of_classes-1
    le = LabelEncoder()
    outputString = le.fit_transform(gender_confident_data['gender'])

    # Splitting the data such that 66% of the data is assigned as training data and the rest as the test data set.
    input_train, input_test, output_train, output_test = train_test_split(inputString, outputString, train_size=0.66)
    return input_train, output_train, input_test, output_test


def select_important_features(data):
    """
    Build training and testing sets from the 5 user profile features used to determine the
    gender of twitter users: the tweet, profile description, sidebar color, link color and
    profile name.

    This is select_features() applied to that fixed list of attributes.
    :param data: Data set
    :return: Training and Testing data sets, ordered as
             (training inputs, training targets, testing inputs, testing targets).
    """
    selected_attributes = ['text_cleaned', 'description_cleaned', 'sidebar_color', 'link_color', 'name']
    return select_features(data, selected_attributes)


def select_features(data, attributes):
    """
    Build training and testing sets from an arbitrary list of user profile attributes.

    Every listed attribute is converted into its own matrix of word token counts and the
    matrices are placed side by side, so each attribute contributes its own group of columns.
    :param data: Data set. Must contain every attribute in `attributes` and 'gender'.
    :param attributes: Names of the attributes (columns) to use as inputs.
    :return: Training and Testing data sets, ordered as
             (training inputs, training targets, testing inputs, testing targets).
    """
    # One freshly fitted vectorizer per attribute. An explicit list is used rather than
    # DataFrame.apply, which is not designed to return one sparse matrix per column.
    token_counts_per_attribute = [
        CountVectorizer().fit_transform(data[attribute]) for attribute in attributes
    ]
    inputString = sp.hstack(token_counts_per_attribute)

    # Encodes class labels from 0 to Num_of_classes-1
    le = LabelEncoder()
    outputString = le.fit_transform(data['gender'])

    # Splitting the data such that 66% of the data is assigned as training data and the rest as the test data set.
    input_train, input_test, output_train, output_test = train_test_split(inputString, outputString, train_size=0.66)
    return input_train, output_train, input_test, output_test
    

def train_and_test_model(In_train, Out_train, In_test, Out_test):
    """
    Train a Naive Bayes, a Decision Tree and a Support Vector Machine classifier on the
    training set and print, for each of them, its name followed by its score on the test set
    (as returned by the classifier's score() method) and a blank line.
    :param In_train: Inputs for training
    :param Out_train: Target values for Training
    :param In_test: Inputs for Testing
    :param Out_test: Target values for Testing
    :return: The trained Naive Bayes Classifier for saving the model.
    """
    NB_classifier = MultinomialNB()
    classifiers = (
        ("Naive Bayes", NB_classifier),
        ("Decision Tree", tree.DecisionTreeClassifier()),
        ("Support Vector Machines", svm.SVC()),
    )

    for title, classifier in classifiers:
        print(title)
        classifier.fit(In_train, Out_train)
        # score() runs the prediction on the test inputs itself, so no separate
        # predict() call is needed.
        print(classifier.score(In_test, Out_test))
        print()

    return NB_classifier



def main():
    """
    The main method.

    Loads and cleans the data set, keeps the rows with fully confident gender labels and
    then trains/tests the classifiers on: every single feature, every combination of 4 and
    of 3 features, the tweet and description merged into one text, and all 5 features.
    :return: None
    """

    data = init()
    data['text_cleaned'] = [clean_text(tweet) for tweet in data['text']]
    data['description_cleaned'] = [clean_text(line) for line in data['description']]
    print(list(data))
    gender_confident_data = check_confidence(data)
    _print_profile_counts(gender_confident_data)

    features = ['text_cleaned', 'description_cleaned', 'sidebar_color', 'link_color', 'name']

    # Using 1 feature
    print("Using 1 feature")
    for feature in features:
        print("feature:", feature)
        # The feature has to be passed on explicitly; otherwise the default attribute
        # ('text_cleaned') is used in every iteration.
        X, Y, x, y = split_dataset_for_one_attribute(gender_confident_data, feature)
        train_and_test_model(X, Y, x, y)

    # Using 4 features: every way of leaving one of the five features out.
    print("Using 4 features")
    for combination in itertools.combinations(features, 4):
        attribute = list(combination)
        print("features:", attribute)
        X, Y, x, y = select_features(gender_confident_data, attribute)
        train_and_test_model(X, Y, x, y)

    # Using 3 features: every distinct set of three features, each evaluated once.
    print("Using 3 features")
    for combination in itertools.combinations(features, 3):
        attribute = list(combination)
        print("features:", attribute)
        X, Y, x, y = select_features(gender_confident_data, attribute)
        train_and_test_model(X, Y, x, y)

    # Using 2 features profile description and tweets description
    # (the confidence filter is applied inside combine_tweet_and_description).
    print("Combining two features")
    X, Y, x, y = combine_tweet_and_description(data)
    train_and_test_model(X, Y, x, y)

    # Using 5 features
    print("Using 5 features")
    X, Y, x, y = select_important_features(gender_confident_data)
    train_and_test_model(X, Y, x, y)


def _print_profile_counts(data):
    """Print how many rows of `data` are labelled 'male', 'female' and 'brand'."""
    for label in ('male', 'female', 'brand'):
        profile_count = len(data[data['gender'] == label])
        print(label.capitalize() + " Profile Count " + str(profile_count))



# Run the experiments only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Column names which has missing values
['_last_judgment_at', 'gender', 'gender:confidence', 'description', 'gender_gold', 'profile_yn_gold', 'tweet_coord', 'tweet_location', 'user_timezone']
Male Profile Count 6194
Female Profile Count 6700
Brand Profile Count 5942
['_unit_id', '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn', 'profile_yn:confidence', 'created', 'description', 'fav_number', 'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage', 'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count', 'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone', 'text_cleaned', 'description_cleaned']
Male Profile Count 4653
Female Profile Count 5367
Brand Profile Count 3784
Using 1 feature
feature: text_cleaned
Naive Bayes
0.5526203664252237

Decision Tree
0.48146570089475926

Support Vector Machines
0.5543246697912229

feature: description_cleaned
Naive Bayes
0.5374946740519813

Decision Tr