## Microsoft Learn Project
* This notebook is powered by, and inspired by:
<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>

</div>


In [None]:
# Mount Google Drive at /content/gdrive; every dataset path in Config lives under this mount.
# NOTE(review): `files` is imported here but not used by any cell visible in this notebook.
from google.colab import drive, files
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%%capture
# NOTE: `%%capture` must remain the first line of this cell; it hides the pip output below.
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# The Unsloth install tracks the repository's default branch (no tag or commit is pinned).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
# Torch older than 2.4.0 gets the pinned xformers 0.0.27; otherwise the newest xformers is installed.
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# `{xformers}` is interpolated by IPython. `--no-deps` stops pip from installing dependencies of
# these packages (presumably to leave the runtime's preinstalled torch untouched -- TODO confirm).
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
# Add your Hugging Face token to get access to the Hub.
# `notebook_login()` renders an interactive login widget in the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Import necessary dependencies


In [None]:
import warnings
warnings.filterwarnings("ignore")
import os
import re
import subprocess
import time
import pandas as pd
from google.colab import files
from typing import Optional



In [None]:
class Config:
  """Central settings for this inference notebook: data paths, model ids and load options."""

  # ---- runtime ----
  # Original author's note: switching to 'cpu' (with a CPU runtime) works, but inference is
  # extremely slow -- about 2 days at most, versus roughly 2 hours on GPU.
  # NOTE(review): neither `device` nor `bits` is read by the cells visible in this notebook.
  device = 'gpu'
  bits = 8

  # ---- data ----
  # Data folder on the mounted Drive; the train/test CSV paths are derived from it.
  folder_path = '/content/gdrive/MyDrive/Microsoft_Learn_Location_Mention-Recognition-Challenge/data/'
  train_filepath = folder_path + 'Train.csv'
  test_filepath = folder_path + 'Test.csv'

  # ---- model ----
  # NOTE(review): `model_name` is not used by the visible cells; the model that is actually
  # loaded is `finetuned_model` below.
  model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
  # Upstream note: choose any length; RoPE scaling is supported automatically.
  max_seq_length = 2048
  # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere and newer.
  dtype = None
  # Load the weights 4-bit quantised to reduce memory usage. Can be False.
  load_in_4bit = True
  # Hub repo of the fine-tuned model used for inference.
  # Alternative noted by the author: Koleshjr/microsoft_learn_2_epochs_mistral_2e_4_v5_sorted_train_1
  finetuned_model = 'Koleshjr/microsoft_learn_2_epochs_gemma_zindi_data_v2'



### The Exciting Part
* If the device is `gpu`, we simply load our custom fine-tuned model from Hugging Face and run inference with it.


In [None]:

# Load the custom fine-tuned model that was pushed to the Hugging Face Hub.
# Quantisation follows Config.load_in_4bit (True in this notebook, i.e. 4-bit weights are loaded).
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = Config.finetuned_model, # YOUR MODEL That you pushed to huggingface
    max_seq_length = Config.max_seq_length,
    dtype = Config.dtype,
    load_in_4bit = Config.load_in_4bit,
)
# This is the last expression of the cell, so its return value is echoed as the cell output.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post2: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.9.post2 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
  

In [None]:
# Load the test tweets; later cells read the `text` and `tweet_id` columns of this frame.
test_df = pd.read_csv(Config.test_filepath)
def clean_text(text):
    """Strip tweet noise from `text` and return the cleaned string.

    The passes run in a fixed order: leading retweet marker, @mentions, URLs,
    every character that is not an ASCII letter/digit, whitespace, comma or
    period, and finally any remaining non-ASCII characters. Only the ends of
    the result are trimmed; whitespace runs left inside the text by the
    removals are kept as they are.

    Args:
        text: The raw tweet as a `str`.

    Returns:
        The cleaned, ASCII-only string.
    """
    # (pattern, flags) pairs; every match is deleted, in this order.
    removal_passes = (
        (r'^RT\s+', 0),                               # leading "RT " retweet marker
        (r'@\w+', 0),                                 # @mentions
        (r'http\S+|www\S+|https\S+', re.MULTILINE),   # URLs
        (r'[^a-zA-Z0-9\s,.]', 0),                     # special characters and emojis
    )
    cleaned = text
    for pattern, flags in removal_passes:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Whatever non-ASCII survived the regex passes (e.g. non-ASCII whitespace) is dropped here.
    ascii_only = cleaned.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.strip()

# Text cleaning is currently disabled, so the raw tweet text is what gets sent to the model.
# test_df['text'] = test_df['text'].apply(clean_text)

# Smoke test: run the prompt on the first test tweet only.
# The tweet is wrapped in `<s>[INST] ... [/INST]`; get_answer() relies on the `[/INST]`
# marker to separate the prompt from the generated text.
inputs = tokenizer(
    [
      f"""<s>[INST] Extract all location names from the given tweet and list them alphabetically, separating each location with a single space., following these guidelines:

Include:
- City/town,
- Country,
- State,
- Human-made Point-of-Interest,
- Neighborhood,
- Island,
- County,
- Continent,
- Road/street,
- District,
- Natural Point-of-Interest
- Other administrative locations
- Geo-points (e.g., Eiffel Tower, Statue of Liberty, Mount Kilimanjaro)
- Geo-lines (e.g., Mississippi River, Andes Mountains, Great Wall of China)
- Geo-areas (e.g., New York City, France, Amazon Rainforest, Mediterranean Sea)

Exclude:
- Addresses (e.g., 123 Main St)
- Relative location expressions (e.g., "nearby", "downtown")
- Generic location terms (e.g., "park", "river" without a specific name)

Additional rules:
1. Extract only explicitly named locations falling into the above categories.
2. Present them in alphabetical order.
3. If a location appears multiple times in the tweet with different casing (even if it's just one character different or difference case), include each variation. If the casing is identical, include it only once.
4. Include nested locations separately (e.g., for "Central Park in New York City", list both "Central Park" and "New York City").
5. Include abbreviated forms of locations (e.g., "NYC") and list them where they would appear alphabetically.
6. For locations with qualifiers, include the full phrase (e.g., "Northern France", "Downtown Chicago").
7. Treat multi-word location names as a single entry (e.g., "New York City" is one entry).
8. For locations in hashtags, include the location name without the hash symbol.
9. For ambiguous terms that could be either locations or something else, use context to determine if it's a location.

Tweet: {test_df['text'][0]} [/INST]"""
    ], return_tensors = "pt").to("cuda")
# Generation is capped at 1012 new tokens.
outputs = model.generate(**inputs, max_new_tokens = 1012, use_cache = True)
# `result` is a list holding one decoded string: the full prompt followed by the completion.
result = tokenizer.batch_decode(outputs)
print(result)


['<bos><s>[INST] Extract all location names from the given tweet and list them alphabetically, separating each location with a single space., following these guidelines:\n\nInclude:\n- City/town,\n- Country,\n- State,\n- Human-made Point-of-Interest,\n- Neighborhood,\n- Island,\n- County,\n- Continent,\n- Road/street,\n- District,\n- Natural Point-of-Interest\n- Other administrative locations\n- Geo-points (e.g., Eiffel Tower, Statue of Liberty, Mount Kilimanjaro)\n- Geo-lines (e.g., Mississippi River, Andes Mountains, Great Wall of China)\n- Geo-areas (e.g., New York City, France, Amazon Rainforest, Mediterranean Sea)\n\nExclude:\n- Addresses (e.g., 123 Main St)\n- Relative location expressions (e.g., "nearby", "downtown")\n- Generic location terms (e.g., "park", "river" without a specific name)\n\nAdditional rules:\n1. Extract only explicitly named locations falling into the above categories.\n2. Present them in alphabetical order.\n3. If a location appears multiple times in the twee

In [None]:
def get_answer(pred, eos_token=None):
  """Extract the generated answer from a decoded model output.

  Args:
    pred: Either the decoded string itself, or a list/tuple whose first
      element is that string (the list produced by `tokenizer.batch_decode`
      in this notebook).
    eos_token: End-of-sequence marker that terminates the answer. When
      omitted, the notebook-level `tokenizer.eos_token` is used, so the
      marker always matches the model that was actually loaded.

  Returns:
    The text between the first `[/INST]` marker and the end-of-sequence
    marker, stripped of surrounding whitespace. An empty string is returned
    when `pred` carries no text (None, NaN, empty list) or does not contain
    `[/INST]`, so one bad row cannot abort a whole `.apply(get_answer)`.
  """
  # Unwrap the one-element list; a bare string is accepted as-is.
  if isinstance(pred, (list, tuple)):
    decoded = pred[0] if len(pred) > 0 else None
  else:
    decoded = pred

  # None / NaN / anything that is not text has no answer to extract.
  if not isinstance(decoded, str):
    return ''

  segments = decoded.split('[/INST]')
  if len(segments) < 2:
    return ''

  # Resolve the marker lazily so inputs without text never need the tokenizer.
  if eos_token is None:
    eos_token = tokenizer.eos_token

  answer = segments[1]
  if eos_token:
    answer = answer.split(eos_token)[0]
  return answer.strip()

# Sanity check: parse the smoke-test generation stored in `result`.
get_answer(result)

'Maryland New England New Orleans<eos>'

In [None]:

def _build_prompt(tweet):
  """Return the instruction prompt for one tweet, wrapped in `<s>[INST] ... [/INST]`."""
  return f"""<s>[INST] Extract all location names from the given tweet and list them alphabetically, separating each location with a single space., following these guidelines:

Include:
- City/town,
- Country,
- State,
- Human-made Point-of-Interest,
- Neighborhood,
- Island,
- County,
- Continent,
- Road/street,
- District,
- Natural Point-of-Interest
- Other administrative locations
- Geo-points (e.g., Eiffel Tower, Statue of Liberty, Mount Kilimanjaro)
- Geo-lines (e.g., Mississippi River, Andes Mountains, Great Wall of China)
- Geo-areas (e.g., New York City, France, Amazon Rainforest, Mediterranean Sea)

Exclude:
- Addresses (e.g., 123 Main St)
- Relative location expressions (e.g., "nearby", "downtown")
- Generic location terms (e.g., "park", "river" without a specific name)

Additional rules:
1. Extract only explicitly named locations falling into the above categories.
2. Present them in alphabetical order.
3. If a location appears multiple times in the tweet with different casing (even if it's just one character different or difference case), include each variation. If the casing is identical, include it only once.
4. Include nested locations separately (e.g., for "Central Park in New York City", list both "Central Park" and "New York City").
5. Include abbreviated forms of locations (e.g., "NYC") and list them where they would appear alphabetically.
6. For locations with qualifiers, include the full phrase (e.g., "Northern France", "Downtown Chicago").
7. Treat multi-word location names as a single entry (e.g., "New York City" is one entry).
8. For locations in hashtags, include the location name without the hash symbol.
9. For ambiguous terms that could be either locations or something else, use context to determine if it's a location.

Tweet: {tweet} [/INST]"""


processing_times = []  # seconds spent on each successfully processed row
failed_rows = []       # index labels of rows whose generation raised an exception

#Perform inference
for row in test_df.itertuples():
  start_time = time.time()
  try:
    inputs = tokenizer([_build_prompt(row.text)], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 1012, use_cache = True)
    # The decoded list (prompt + completion) is stored as-is; get_answer() unwraps it later.
    result = tokenizer.batch_decode(outputs)
    test_df.loc[row.Index, 'model_answer'] = result
  except Exception as e:
    # Best effort: keep going, but record which row failed so it can be inspected or re-run.
    failed_rows.append(row.Index)
    print(f"An error occurred on row {row.Index}: {e}")
    continue

  elapsed_time = time.time() - start_time
  processing_times.append(elapsed_time)
  print(f"Time taken for processing row {row.Index}: {elapsed_time} seconds")

if failed_rows:
  print(f"{len(failed_rows)} row(s) failed and have no model_answer: {failed_rows}")

# Persist the raw generations before any post-processing.
test_df.to_csv(Config.folder_path+ "raw_test_with_preds_sorted_3_epochs.csv", index=False)
display(test_df.head())

Time taken for processing row 0: 1.2286179065704346 seconds
Time taken for processing row 1: 1.332853078842163 seconds
Time taken for processing row 2: 1.2280452251434326 seconds
Time taken for processing row 3: 1.4427740573883057 seconds
Time taken for processing row 4: 1.4521105289459229 seconds
Time taken for processing row 5: 0.8743348121643066 seconds
Time taken for processing row 6: 0.7826972007751465 seconds
Time taken for processing row 7: 1.2181124687194824 seconds
Time taken for processing row 8: 1.2630131244659424 seconds
Time taken for processing row 9: 1.2114315032958984 seconds
Time taken for processing row 10: 1.2215032577514648 seconds
Time taken for processing row 11: 1.2444753646850586 seconds
Time taken for processing row 12: 0.9272325038909912 seconds
Time taken for processing row 13: 1.2236506938934326 seconds
Time taken for processing row 14: 1.3849706649780273 seconds
Time taken for processing row 15: 0.8908610343933105 seconds
Time taken for processing row 16: 1

Unnamed: 0,tweet_id,text,model_answer
0,ID_1001154804658286592,What is happening to the infrastructure in New...,[<bos><s>[INST] Extract all location names fro...
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...,[<bos><s>[INST] Extract all location names fro...
2,ID_1001155756371136512,RT @TIME: Police searching for missing person ...,[<bos><s>[INST] Extract all location names fro...
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...,[<bos><s>[INST] Extract all location names fro...
4,ID_1001164907587538944,Ellicott City #FLOODING Pictures: Maryland Gov...,[<bos><s>[INST] Extract all location names fro...


In [None]:
def get_answer(pred, eos_token=None):
  """Extract the generated answer from a decoded model output.

  This definition replaces the earlier `get_answer` in this notebook.

  Args:
    pred: Either the decoded string itself, or a list/tuple whose first
      element is that string (the list stored in the `model_answer` column).
    eos_token: End-of-sequence marker that terminates the answer. When
      omitted, the notebook-level `tokenizer.eos_token` is used.

  Returns:
    The text between the first `[/INST]` marker and the end-of-sequence
    marker, stripped of surrounding whitespace. An empty string is returned
    when `pred` carries no text (None, NaN, empty list) or does not contain
    `[/INST]`, so rows whose generation failed do not abort
    `.apply(get_answer)`.
  """
  # Unwrap the one-element list; a bare string is accepted as-is.
  if isinstance(pred, (list, tuple)):
    decoded = pred[0] if len(pred) > 0 else None
  else:
    decoded = pred

  # None / NaN / anything that is not text has no answer to extract.
  if not isinstance(decoded, str):
    return ''

  segments = decoded.split('[/INST]')
  if len(segments) < 2:
    return ''

  # Resolve the marker lazily so inputs without text never need the tokenizer.
  if eos_token is None:
    eos_token = tokenizer.eos_token

  answer = segments[1]
  if eos_token:
    answer = answer.split(eos_token)[0]
  return answer.strip()
# Spot-check the parser on the first stored generation.
model_answer = test_df['model_answer'][0]
get_answer(model_answer)


'Maryland New England New Orleans'

In [None]:

# Parse every stored generation into a plain answer string.
test_df['answer'] = test_df['model_answer'].apply(get_answer)
display(test_df.head())

Unnamed: 0,tweet_id,text,model_answer,answer
0,ID_1001154804658286592,What is happening to the infrastructure in New...,[<bos><s>[INST] Extract all location names fro...,Maryland New England New Orleans
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...,[<bos><s>[INST] Extract all location names fro...,Ellicott City MARYLAND
2,ID_1001155756371136512,RT @TIME: Police searching for missing person ...,[<bos><s>[INST] Extract all location names fro...,Ellicott City Maryland
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...,[<bos><s>[INST] Extract all location names fro...,Ellicott City Maryland
4,ID_1001164907587538944,Ellicott City #FLOODING Pictures: Maryland Gov...,[<bos><s>[INST] Extract all location names fro...,Ellicott City Maryland


In [None]:
import numpy as np

# Work on an explicit copy: assigning columns on a slice of `test_df` triggers pandas'
# SettingWithCopy behaviour, which the global warnings filter in this notebook would hide.
gpu_sub = test_df[['tweet_id', 'answer']].copy()

# Empty answers become a single space
# (presumably so the CSV cell is not read back as missing -- TODO confirm).
gpu_sub['answer'] = np.where(gpu_sub['answer'] == '', ' ', gpu_sub['answer'])

# Normalise comma-separated answers to space-separated ones; the pattern is a literal, not a regex.
gpu_sub['answer'] = gpu_sub['answer'].str.replace(', ', ' ', regex=False)

gpu_sub.to_csv("ft_gemma_zindi_v1_sorted_dirty_test_1.csv", index=False)
gpu_sub.head()

Unnamed: 0,tweet_id,answer
0,ID_1001154804658286592,Maryland New England New Orleans
1,ID_1001155505459486720,Ellicott City MARYLAND
2,ID_1001155756371136512,Ellicott City Maryland
3,ID_1001159445194399744,Ellicott City Maryland
4,ID_1001164907587538944,Ellicott City Maryland


In [None]:
# Count missing values per column of the submission frame.
print(gpu_sub.isnull().sum())

tweet_id    0
answer      0
dtype: int64


In [None]:
# Inspect the raw stored generation (prompt followed by completion) for row 2.
test_df['model_answer'][2]

['<bos><s>[INST] Extract all location names from the given tweet and list them alphabetically, separating each location with a single space., following these guidelines:\n\nInclude:\n- City/town,\n- Country,\n- State,\n- Human-made Point-of-Interest,\n- Neighborhood,\n- Island,\n- County,\n- Continent,\n- Road/street,\n- District,\n- Natural Point-of-Interest\n- Other administrative locations\n- Geo-points (e.g., Eiffel Tower, Statue of Liberty, Mount Kilimanjaro)\n- Geo-lines (e.g., Mississippi River, Andes Mountains, Great Wall of China)\n- Geo-areas (e.g., New York City, France, Amazon Rainforest, Mediterranean Sea)\n\nExclude:\n- Addresses (e.g., 123 Main St)\n- Relative location expressions (e.g., "nearby", "downtown")\n- Generic location terms (e.g., "park", "river" without a specific name)\n\nAdditional rules:\n1. Extract only explicitly named locations falling into the above categories.\n2. Present them in alphabetical order.\n3. If a location appears multiple times in the twee