# Evaluating gemma-7b-it for text classification
## Task: Identify if a tweet is about a "disaster"

### EDA and Preprocessing

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Additional
import re
import html

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
# Read data
submission_path = '/kaggle/input/nlp-getting-started/sample_submission.csv'
train_path = '/kaggle/input/nlp-getting-started/train.csv'
test_path = '/kaggle/input/nlp-getting-started/test.csv'

submission_df = pd.read_csv(submission_path)
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Describe
print("train")
print(train_df.shape)
print(train_df.columns.tolist())
print(train_df.target.value_counts(normalize = True))

print("\ntest")
print(test_df.shape)
print(test_df.columns.tolist())
print(train_df.target.value_counts(normalize = True))

train
(7613, 5)
['id', 'keyword', 'location', 'text', 'target']
target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

test
(3263, 4)
['id', 'keyword', 'location', 'text']
target
0    0.57034
1    0.42966
Name: proportion, dtype: float64


In [4]:
# What is a "disater"?
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target
2607,3742,destroyed,USA,Black Eye 9: A space battle occurred at Star O784 involving 2 fleets totaling 3934 ships with 7 destroyed,0
1527,2209,chemical%20emergency,,Emergency Response and Hazardous Chemical Management: Principles and Practices http://t.co/4sSuyhkgRB http://t.co/TDerBtgZ2k,0
2273,3260,demolish,State of Dreaming,Just us four can demolish this?? @Createdunique23 @Keren_Serpa @ArianaReed11 https://t.co/PCiNc8ytFH,0
5574,7954,rainstorm,"Pioneer Village, KY",'The way you move is like a full on rainstorm and I'm a house of cards',1
4307,6115,hellfire,"570 Vanderbilt; Brooklyn, NY",New cocktail on the list! El Diablo Mas Verde: mezcal yellow chartreuse honey cucumber hellfire bitters.... http://t.co/REuosJEK4m,0


* Texts have keywords which appear in them (possibly used for searching tweets that could be referring disaster).
* Target: 1 = Actual Disaster, 0 = Not Disaster.
* Location provides additional context to tweet.

In [5]:
# describe columns
def describe_column(df, column_name, sample_size = 20):
    print(
        f"Column: {column_name}\n"
        f"Number of rows: {df.shape[0]}\n"
        f"Number of unique values: {len(df[column_name].unique())}\n"
        f"Empty %: {round(((df[column_name].isna().sum() / df.shape[0]) * 100), 2)}%\n"
        f"Sample: {df[column_name].unique()[:sample_size]}"
    )

In [6]:
# keyword
describe_column(train_df, 'keyword')

Column: keyword
Number of rows: 7613
Number of unique values: 222
Empty %: 0.8%
Sample: [nan 'ablaze' 'accident' 'aftershock' 'airplane%20accident' 'ambulance'
 'annihilated' 'annihilation' 'apocalypse' 'armageddon' 'army' 'arson'
 'arsonist' 'attack' 'attacked' 'avalanche' 'battle' 'bioterror'
 'bioterrorism' 'blaze']


In [7]:
# location
describe_column(train_df, 'location')

Column: location
Number of rows: 7613
Number of unique values: 3342
Empty %: 33.27%
Sample: [nan 'Birmingham' 'Est. September 2012 - Bristol' 'AFRICA'
 'Philadelphia, PA' 'London, UK' 'Pretoria' 'World Wide!!'
 'Paranaque City' 'Live On Webcam' 'milky way' 'GREENSBORO,NORTH CAROLINA'
 'England.' 'Sheffield Township, Ohio' 'India' 'Barbados' 'Anaheim'
 'Abuja' 'USA' 'South Africa']


In [8]:
# location
describe_column(train_df, 'target', 2)

Column: target
Number of rows: 7613
Number of unique values: 2
Empty %: 0.0%
Sample: [1 0]


In [9]:
# Empty keywords
train_df[train_df['keyword'].isna()].sample(5)

Unnamed: 0,id,keyword,location,text,target
26,38,,,Was in NYC last week!,0
19,28,,,What a goooooooaaaaaal!!!!!!,0
7589,10843,,,Omg earthquake,1
7585,10839,,,Calgary Police Flood Road Closures in Calgary. http://t.co/RLN09WKe9g,1
18,26,,,My car is so fast,0


* No reason for Text's with missing keywords to be removed.
* Duplicate Texts need further investigation.

In [10]:
# Do texts vary with target, location or keyword.
train_text_dupes = train_df[train_df.duplicated(subset='text', keep=False)]
print(f"Count of text dupes: {train_text_dupes.shape[0]}")
train_vary_target = train_text_dupes.groupby('text').filter(
    lambda x: x['target'].nunique() == 2).sort_values('text').iloc[:10]
train_vary_target

Count of text dupes: 179


Unnamed: 0,id,keyword,location,text,target
4290,6094,hellfire,"Jubail IC, Saudi Arabia.",#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,0
4299,6105,hellfire,?????? ??? ?????? ????????,#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,0
4312,6123,hellfire,?????? ???? ??????,#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,1
4244,6031,hazardous,"New Delhi, Delhi",#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption,0
4221,5996,hazardous,,#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption,1
4239,6023,hazardous,"Mysore, Karnataka",#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption,1
2832,4076,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,0
2831,4072,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1
2830,4068,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1
2833,4077,displaced,Pedophile hunting ground,.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4,1


In [11]:
# Are there duplicate texts in the train and test df
test_df[test_df['text'].isin(train_df['text'])]

Unnamed: 0,id,keyword,location,text
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw
47,152,aftershock,Switzerland,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/THyzOMVWU0 | @djicemoon | #Dubstep #TrapMusic #DnB #EDM #Dance #IcesÛ_ http://t.co/83jOO0xk29
53,177,aftershock,Switzerland,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/THyzOMVWU0 | @djicemoon | #Dubstep #TrapMusic #DnB #EDM #Dance #IcesÛ_ http://t.co/83jOO0xk29
55,181,aftershock,304,'There is no victory at bargain basement prices.' Dwight David Eisenhower
150,497,army,Campinas Sp,You da One \n\n#MTVSummerStar #VideoVeranoMTV #MTVHottest Britney Spears Lana Del Rey
...,...,...,...,...
3090,10232,volcano,Planet Earth,Learning from the Legacy of a Catastrophic Eruption - The New Yorker http://t.co/y8YqPBE4t9
3230,10738,wreckage,India,Wreckage 'Conclusively Confirmed' as From MH370: Malaysia PM
3235,10758,wreckage,India,Wreckage 'Conclusively Confirmed' as From MH370: Malaysia PM
3259,10865,,,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power


* Remove all duplicates that are in train and test df from train df.
* Remove all duplicates that vary by target in train_df (keeping neither).
* Since we have enough records we choose to remove rather than re-label.
* Remove all duplicates that vary by location in train_df (keeping first - random).

In [12]:
# Remove all duplicates that are in train and test df from train df.

# Count of rows in test_df before dropping
print(f"Number of rows before: {train_df.shape[0]}")

# test_df rows with 'text' appearing in train_df
rows_to_drop = train_df['text'].isin(test_df['text'])

# Dropping these rows from test_df
train_df = train_df[~rows_to_drop]

# Count of rows in test_df after dropping
print(f"Number of rows after: {train_df.shape[0]}")

Number of rows before: 7613
Number of rows after: 7486


In [13]:
# Remove all duplicates that vary by target in train_df (keeping neither).

# Count of rows in train_df before dropping
print(f"Number of rows before: {train_df.shape[0]}")

# Dropping train_vary_target rows from train_df
train_df = train_df[~train_df['text'].isin(train_vary_target['text'])]

# Count of rows in test_df after dropping
print(f"Number of rows after: {train_df.shape[0]}")

Number of rows before: 7486
Number of rows after: 7483


In [14]:
# Remove duplicates that vary by location in train_df (keeping first - random).

# Count of rows in train_df before dropping
print(f"Number of rows before: {train_df.shape[0]}")

# Remove duplicates in the 'text' column, keeping the first occurrence
train_df = train_df.drop_duplicates('text', keep='first')

# Count of rows in test_df after dropping
print(f"Number of rows after: {train_df.shape[0]}")

Number of rows before: 7483
Number of rows after: 7434


In [15]:
# Clean text
def clean_text(text):
    # Skip if not string
    if not isinstance(text, str):
        return text    
    text = html.unescape(text)     # Convert HTML character codes
    text = re.sub(r'https?://\S+|www\.\S+', '', text)    # Remove URLs
    # Remove if not letter, numbers, normal punctuation, spaces, @, #
    text = re.sub(r'[^a-zA-Z0-9\s,.?!#@]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()     # Remove extra spaces
    return text

In [16]:
# Text
train_df['text_clean'] = train_df['text'].apply(clean_text)
test_df['text_clean'] = test_df['text'].apply(clean_text)

# Location
train_df['location'] = train_df['location'].fillna('Unknown') # Fill NaNs
test_df['location'] = test_df['location'].fillna('Unknown') # Fill NaNs
train_df['location_clean'] = train_df['location'].apply(clean_text)
test_df['location_clean'] = test_df['location'].apply(clean_text)

# Combining (adding location context to text)
train_df['text_location'] = train_df.apply(
    lambda row: f"Location: {row['location']}. Tweet: {row['text']}", axis=1)
test_df['text_location'] = test_df.apply(
    lambda row: f"Location: {row['location']}. Tweet: {row['text']}", axis=1)

# Combining clean (adding location context to text)
train_df['text_location_clean'] = train_df.apply(
    lambda row: f"Location: {row['location_clean']}. Tweet: {row['text_clean']}", axis=1)
test_df['text_location_clean'] = test_df.apply(
    lambda row: f"Location: {row['location_clean']}. Tweet: {row['text_clean']}", axis=1)

# Class
train_df['class'] = train_df['target'].apply(lambda x: 'yes' if x == 1 else 'no')

In [17]:
# gemma it chat template
# chat = [
#     { "role": "user", "content": "Write a hello world program" },
# ]

# <bos><start_of_turn>user
# Write a hello world program<end_of_turn>
# <start_of_turn>model

# Add prompt field
def generate_prompt(X, text):
    return f"""<bos><start_of_turn>user
Identify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):
[ {X[text]} ] <end_of_turn>
<start_of_turn>model
{X['class']}"""

def generate_test_prompt(X, text):
    return f"""<bos><start_of_turn>user
Identify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):
[ {X[text]} ] <end_of_turn>
<start_of_turn>model
"""

In [18]:
# Train -----------------------------------------------------------------

# Text
train_df['prompt_text'] = train_df.apply(lambda row: generate_prompt(row, 'text'), axis=1)
# Text test
train_df['prompt_text_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text'), axis=1)
# Text and location
train_df['prompt_text_location'] = train_df.apply(lambda row: generate_prompt(row, 'text_location'), axis=1)
# Text and location test
train_df['prompt_text_location_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_location'), axis=1)
# Text clean
train_df['prompt_text_clean'] = train_df.apply(lambda row: generate_prompt(row, 'text_clean'), axis=1)
# Text clean test
train_df['prompt_text_clean_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_clean'), axis=1)
# Text location clean
train_df['prompt_text_location_clean'] = train_df.apply(lambda row: generate_prompt(row, 'text_location_clean'), axis=1)
# Text location clean test
train_df['prompt_text_location_clean_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_location_clean'), axis=1)

# Test ------------------------------------------------------------------

# Text test
test_df['prompt_text_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text'), axis=1)
# Text and location test
test_df['prompt_text_location_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_location'), axis=1)
# Text clean test
test_df['prompt_text_clean_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_clean'), axis=1)
# Text location clean test
train_df['prompt_text_location_clean_test'] = train_df.apply(lambda row: generate_test_prompt(row, 'text_location_clean'), axis=1)


train_df.sample(2)

Unnamed: 0,id,keyword,location,text,target,text_clean,location_clean,text_location,text_location_clean,class,prompt_text,prompt_text_test,prompt_text_location,prompt_text_location_test,prompt_text_clean,prompt_text_clean_test,prompt_text_location_clean,prompt_text_location_clean_test
6663,9551,threat,"Ohio, USA",The few I warned about .. Were just as I expected.. They are a threat to his soul,0,The few I warned about .. Were just as I expected.. They are a threat to his soul,"Ohio, USA","Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul","Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul",no,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\n,"<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\nno","<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\n",<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\n,"<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\nno","<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: Ohio, USA. Tweet: The few I warned about .. Were just as I expected.. They are a threat to his soul ] <end_of_turn>\n<start_of_turn>model\n"
5869,8386,ruin,MNL,@clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol,0,@clnv Yes Yes! I will. I dont wanna ruin my life. Lol,MNL,Location: MNL. Tweet: @clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol,Location: MNL. Tweet: @clnv Yes Yes! I will. I dont wanna ruin my life. Lol,no,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ @clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ @clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\n,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: MNL. Tweet: @clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: MNL. Tweet: @clnv_ Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\n,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ @clnv Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ @clnv Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\n,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: MNL. Tweet: @clnv Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\nno,<bos><start_of_turn>user\nIdentify if the folowing Tweet is part of a thread discussing a disaster (yes/ no):\n[ Location: MNL. Tweet: @clnv Yes Yes! I will. I dont wanna ruin my life. Lol ] <end_of_turn>\n<start_of_turn>model\n


In [19]:
# Save files
train_df.to_csv('disaster_train.csv', index=False)
test_df.to_csv('disaster_test.csv', index=False)
submission_df.to_csv('disaster_submission.csv', index=False)