In [1]:
from ydata_profiling import ProfileReport
import pandas as pd
import numpy as np

# Import Data

In [2]:
# Location of the raw Twitter export used throughout the notebook.
raw_csv_path = r'C:\Users\johna\anaconda3\envs\twitter-env-1\Data\For Candidate - Data Scientist Role\twitter_data.csv'

# Load the raw frame, then print its column/dtype/non-null summary.
df = pd.read_csv(raw_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20048 entries, 0 to 20047
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   unit_id                20048 non-null  int64 
 1   gender                 19949 non-null  object
 2   created                20048 non-null  object
 3   user_description       16304 non-null  object
 4   user_favourite_number  20048 non-null  int64 
 5   link_color             20048 non-null  object
 6   name                   20048 non-null  object
 7   profileimage           20048 non-null  object
 8   retweet_count          20048 non-null  int64 
 9   sidebar_color          20048 non-null  object
 10  tweet_text             20048 non-null  object
 11  tweet_coord            159 non-null    object
 12  tweet_count            20048 non-null  int64 
 13  tweet_location         12564 non-null  object
 14  user_timezone          12250 non-null  object
dtypes: int64(4), object

In [3]:
# Destination of the HTML profile for the untouched, full dataset.
full_report_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\twitter profile report.html"

profile = ProfileReport(df, title="Twitter Profiling Report")
profile.to_file(full_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Data Processing 

In [4]:
import pandas as pd
import numpy as np

class DataProcessor:
    """Clean a Twitter user frame and split it into train/validation/test sets.

    The processor works on a private copy of the frame passed to the
    constructor. Calling :meth:`process_data` runs every step in order and
    leaves the results in ``train_df``, ``val_df`` and ``test_df``.

    Attributes:
        df: The working frame; each cleaning step replaces it with a new frame.
        duplicates: Rows whose ``name`` occurs more than once (empty until
            :meth:`isolate_duplicates` has run).
        train_df, val_df, test_df: Set by :meth:`split_data`.
    """

    def __init__(self, df):
        """Store a copy of ``df`` so the caller's frame is never modified."""
        self.df = df.copy()
        # Start with an empty, same-schema frame so split_data() also works
        # when isolate_duplicates() was never called.
        self.duplicates = self.df.iloc[0:0]

    def drop_unknown_users(self):
        """Remove rows whose ``gender`` is ``'unknown'`` or missing.

        Missing genders must go as well: they cannot be used as a training
        label (a categorical encoding turns them into code ``-1``).
        """
        gender = self.df['gender']
        has_usable_gender = gender.notna() & (gender != 'unknown')
        # .copy() detaches the result from the original frame so later column
        # assignments do not raise SettingWithCopyWarning.
        self.df = self.df[has_usable_gender].copy()

    def replace_blank_user_description(self):
        """Fill missing ``user_description`` values with a placeholder text."""
        self.df = self.df.assign(
            user_description=self.df['user_description'].fillna('no user description')
        )

    def drop_columns(self, columns_to_drop):
        """Drop the given columns from the working frame.

        Args:
            columns_to_drop: Column label or list of labels to remove.

        Raises:
            KeyError: If a label is not present (raised by pandas).
        """
        self.df = self.df.drop(columns=columns_to_drop)

    def isolate_duplicates(self):
        """Move every row whose ``name`` appears more than once into ``self.duplicates``.

        All occurrences are moved (not only the repeats), so the remaining
        working frame has unique names.
        """
        is_repeated_name = self.df['name'].duplicated(keep=False)
        self.duplicates = self.df[is_repeated_name]
        self.df = self.df[~is_repeated_name]

    def split_data(self, train_pct=0.7, val_pct=0.15, seed=42):
        """Shuffle the working frame and split it into train/validation/test.

        The test set receives whatever remains after the train and validation
        rows are taken. Rows previously set aside by
        :meth:`isolate_duplicates` are appended to the training set only, so a
        repeated name can never straddle two splits.

        Args:
            train_pct: Fraction of rows for the training set.
            val_pct: Fraction of rows for the validation set.
            seed: Seed for the shuffle; passed to pandas as ``random_state``
                so NumPy's global random state is left untouched.

        Raises:
            ValueError: If the fractions are negative or sum to more than 1.
            AssertionError: If the same name ends up in two different splits.
        """
        if train_pct < 0 or val_pct < 0 or train_pct + val_pct > 1 + 1e-9:
            raise ValueError(
                "train_pct and val_pct must be non-negative and sum to at most 1"
            )

        # Shuffle the data reproducibly without reseeding the global RNG.
        shuffled_df = self.df.sample(frac=1, random_state=seed)

        # Determine the slice boundaries for each set.
        num_rows = len(shuffled_df)
        train_end = int(num_rows * train_pct)
        val_end = train_end + int(num_rows * val_pct)

        # Split the data into sets.
        self.train_df = shuffled_df.iloc[:train_end]
        self.val_df = shuffled_df.iloc[train_end:val_end]
        self.test_df = shuffled_df.iloc[val_end:]

        # Add duplicates to training set (skipped when there are none).
        if len(self.duplicates) > 0:
            self.train_df = pd.concat([self.train_df, self.duplicates])

        # Ensure no overlap in names. Raised explicitly (rather than with the
        # `assert` statement) so the check still runs under `python -O`.
        names_by_split = {
            "train": set(self.train_df['name']),
            "validation": set(self.val_df['name']),
            "test": set(self.test_df['name']),
        }
        for first, second in (("train", "validation"), ("train", "test"), ("validation", "test")):
            if names_by_split[first] & names_by_split[second]:
                raise AssertionError(f"Overlap in names between {first} and {second} sets")

    def process_data(self, cols_to_drop):
        """Run the full pipeline: filter, fill, drop columns, isolate duplicates, split.

        Args:
            cols_to_drop: Columns to remove before splitting.
        """
        self.drop_unknown_users()
        self.replace_blank_user_description()
        self.drop_columns(cols_to_drop)
        self.isolate_duplicates()
        self.split_data()


In [5]:
# Columns excluded before the data is split.
cols_to_drop = ["tweet_coord", "user_timezone", "tweet_location", "unit_id"]

# Run the full cleaning + splitting pipeline on the raw frame.
processor = DataProcessor(df)
processor.process_data(cols_to_drop)

# Expose the three splits under notebook-level names.
train_df, validation_df, test_df = (
    processor.train_df,
    processor.val_df,
    processor.test_df,
)

In [6]:
# Profile each split after data prep and write one HTML report per split.
train_report_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Training Data-twitter profile report.html"
validation_report_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Validation Data-twitter profile report.html"
test_report_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Test Data-twitter profile report.html"

# Training split
profile_train = ProfileReport(train_df, title="Training Data: twitter profile report")
profile_train.to_file(train_report_path)

# Validation split
profile_validation = ProfileReport(validation_df, title="Validation Data: twitter profile report")
profile_validation.to_file(validation_report_path)

# Test split
profile_test = ProfileReport(test_df, title="Test Data: twitter profile report")
profile_test.to_file(test_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Pairwise comparison reports between the three split profiles.
train_vs_val_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Comparison-train_vs_validation.html"
train_vs_test_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Comparison-train_vs_test.html"
val_vs_test_path = r"C:\Users\johna\anaconda3\envs\twitter-env-1\Reports\Comparison-val_vs_test.html"

# train vs. validation
comparison_train_val = profile_train.compare(profile_validation)
comparison_train_val.to_file(train_vs_val_path)

# train vs. test
comparison_train_test = profile_train.compare(profile_test)
comparison_train_test.to_file(train_vs_test_path)

# validation vs. test
comparison_val_test = profile_validation.compare(profile_test)
comparison_val_test.to_file(val_vs_test_path)


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Create labels and features

def features_labels(df, label, feature, categories=None):
    """Extract integer class labels and raw feature values from a frame.

    Args:
        df: Frame holding both columns.
        label: Name of the column to encode as integer class codes.
        feature: Name of the column returned unchanged as the model input.
        categories: Optional ordered list of class values. When given, code
            ``i`` always means ``categories[i]``, so several frames encoded
            with the same list share one mapping. When omitted, the classes
            are inferred from ``df[label]`` alone, as before.

    Returns:
        ``(y, X)``: ``y`` is a list of plain Python ints (one code per row;
        ``-1`` marks a value that is missing or not in ``categories``) and
        ``X`` is the list of values from ``df[feature]``.
    """
    codes = pd.Categorical(df[label], categories=categories).codes
    # .tolist() yields Python ints rather than numpy int8 scalars, so tensors
    # built from these labels do not inherit an 8-bit dtype.
    y = codes.tolist()
    X = list(df[feature])
    return y, X


# Derive the class list once, from the training split, so that all three
# splits are encoded with the same label -> code mapping even if one split
# happens to lack a class.
gender_classes = sorted(train_df["gender"].dropna().unique())

train_labels, train_user_description = features_labels(
    train_df, label="gender", feature="user_description", categories=gender_classes
)
validation_labels, validation_user_description = features_labels(
    validation_df, label="gender", feature="user_description", categories=gender_classes
)
test_labels, test_user_description = features_labels(
    test_df, label="gender", feature="user_description", categories=gender_classes
)

In [9]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')

# Same tokenizer options for every split: truncate over-long texts and pad
# each split's sequences to a common length.
encode_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_user_description, **encode_options)
val_encodings = tokenizer(validation_user_description, **encode_options)
test_encodings = tokenizer(test_user_description, **encode_options)

In [10]:
# Create Dataset object

import torch

class TwitterDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence of integer class labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: labels that arrive as numpy int8 (e.g. from pandas
        # category codes) would otherwise yield an int8 tensor, which
        # classification losses reject as a class-index target.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap each split's encodings and labels in a TwitterDataset.
train_dataset, validation_dataset, test_dataset = (
    TwitterDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, validation_labels),
        (test_encodings, test_labels),
    )
)

In [12]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Output locations for the training run.
model_output_dir = r'C:\Users\johna\anaconda3\envs\twitter-env-1\Data\Model Training Results'
training_log_dir = r'C:\Users\johna\anaconda3\envs\twitter-env-1\Data\Model Training Logs'

training_args = TrainingArguments(
    # -- where results and logs go --
    output_dir=model_output_dir,
    logging_dir=training_log_dir,
    logging_steps=10,
    # -- schedule --
    num_train_epochs=3,
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,
    # -- batch sizes (per device) --
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Pretrained multilingual DistilBERT with a 3-class classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=3,
)

# Trainer wiring: the model, its arguments, and the train/eval datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'pre_classif

Step,Training Loss


KeyboardInterrupt: 