# Understanding Gender Differences in Tweets with Deep Learning 

***

## Summer Chambers (ssc4mc), Keyu Chen (km5ar), Hannah Frederick (hbf3k)  

The .csv file that we read in can be found here: https://www.kaggle.com/crowdflower/twitter-user-gender-classification

The code that we source from can be found here: https://www.kaggle.com/gcdatkin/deep-recurrent-nn-for-gender-classification#Data-Every-Day

Ensemble article: https://sailajakarra.medium.com/ensemble-scikit-learn-and-keras-be93206c54c4


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
import nltk
nltk.download('wordnet')
import pydot
import graphviz
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
data = pd.read_csv("/content/drive/MyDrive/gender-classifier-DFE-791531.csv", encoding = "latin1")
#data = pd.read_csv('/content/drive/MyDrive/DL_Final_code/gender-classifier-DFE-791531.csv', encoding='latin-1')

In [6]:
def get_sequences(texts, vocab_length):
    lemma = nltk.WordNetLemmatizer()
    texts = [lemma.lemmatize(word) for word in texts]

    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)
    
    sequences = tokenizer.texts_to_sequences(texts)
    
    max_seq_length = np.max([len(sequence) for sequence in sequences])
    
    sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    
    return sequences

In [7]:
def preprocess_input(df, vocab_length=20000):
    df = df.copy()
    
    # Drop unnecessary columns
    df = df[['gender','text', 'description']]
    
    # Encode unknown values in the target column as np.NaN
    df['gender'] = df['gender'].replace('unknown', np.NaN)
    
    # Drop rows with missing target values
    gender_nas = df[df['gender'].isna()].index
    df = df.drop(gender_nas, axis=0).reset_index(drop=True)

    # Drop rows with "brand" target values
    gender_brand = df[df['gender'] == 'brand'].index
    df = df.drop(gender_brand, axis=0).reset_index(drop=True)

    # Let's encode the missing values in the description column as empty strings
    df['description'] = df['description'].fillna('')

    # Get sequence data for text and description columns
    text = df['description'] + ' ' + df['text']
    text = get_sequences(text, vocab_length)
    
    # Encode label column
    label_mapping = {'female': 0, 'male': 1}
    df['gender'] = df['gender'].replace(label_mapping)
    
    # Get target data from gender column
    y = df['gender'].copy()
    
    return text, y

In [8]:
text, y = preprocess_input(data)


In [9]:
text_train, text_test, y_train, y_test = train_test_split(text, y, train_size=0.8, random_state=1)

In [10]:
project_id = "mnist-test-311618"
import os

In [11]:
import googleapiclient.discovery
from google.api_core.client_options import ClientOptions

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/gru-mnist-test-311618-c981306c5b3b.json"
model_id = "my_gru_model"
model_path = "projects/{}/models/{}".format(project_id, model_id)
model_path += "/versions/v0001/" # if you want to run a specific version
#ml_resource = googleapiclient.discovery.build("ml", "v1").projects()
endpoint = 'https://us-central1-ml.googleapis.com'
client_options = ClientOptions(api_endpoint=endpoint)
ml_resource = googleapiclient.discovery.build("ml", "v1", client_options=client_options, cache_discovery=False).projects()

In [12]:
model_path

'projects/mnist-test-311618/models/my_gru_model/versions/v0001/'

In [13]:
input_data_json = {"signature_name": "serving_default",
                       "instances": text_train[:1].tolist()}
request = ml_resource.predict(name=model_path, body=input_data_json)
response = request.execute()

In [14]:
response

{'predictions': [[0.782621562, 0.217378497]]}

In [15]:
y_train[:1]

4740    0
Name: gender, dtype: int64

In [6]:
def build_model(out_dim=256, train_bool=True, learning_rate=0.0001):

    text_inputs = tf.keras.Input(shape=(text.shape[1],))

    #desc
    text_embedding = tf.keras.layers.Embedding(
        input_dim=20000,
        output_dim=out_dim,
        input_length=text.shape[1],
        trainable=train_bool
    )(text_inputs)
    text_gru = tf.keras.layers.GRU(out_dim, return_sequences=False)(text_embedding)
    text_flatten = tf.keras.layers.Flatten()(text_embedding)
    text_concat = tf.keras.layers.concatenate([text_gru, text_flatten])

    outputs = tf.keras.layers.Dense(2, activation='softmax')(text_concat)
    model = tf.keras.Model(inputs=text_inputs, outputs=outputs)
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])  
    return model

In [15]:
batch_size = 32
epochs = 1

best_params = {'train_bool': True, 'out_dim': 256, 'learning_rate': 0.0001}
model = build_model(out_dim=best_params["out_dim"], train_bool=best_params["train_bool"], learning_rate=best_params['learning_rate'])
model.fit(text_train,
          y_train,
          #batch_size=batch_size,
          epochs=epochs,
          callbacks=[keras.callbacks.EarlyStopping(monitor="loss", patience=10)])

  8/323 [..............................] - ETA: 2:31 - loss: 0.6916 - accuracy: 0.5278

KeyboardInterrupt: ignored