In [1]:
import transformers
import torch
import huggingface_hub

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import snapshot_download
# Set your local model destination path
local_model_path = "./models/phi-2"

In [3]:
# Download all model files into the folder
model_path = snapshot_download(
    repo_id="microsoft/phi-2",
    local_dir=local_model_path,
    local_dir_use_symlinks=False,  # not needed anymore, but safe to include
    ignore_patterns=["*.msgpack"]  # Optional: skip unwanted files
)

print(f"Model downloaded to: {model_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

CODE_OF_CONDUCT.md:   0%|          | 0.00/444 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

SECURITY.md:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

NOTICE.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Model downloaded to: /content/models/phi-2


In [4]:
# Load and test tokenizer/model from local path
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModelForCausalLM.from_pretrained(local_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Optional test
inputs = tokenizer("Hello, how are you today?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hello, how are you today?



In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [2]:
# Load CSV (Sentiment140)
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)
df = df[[0, 5]]  # Keep only sentiment and text
df.columns = ["label", "text"]

# Map: 0 = negative, 2 = neutral, 4 = positive
df = df[df['label'].isin([0, 2, 4])]
df['label'] = df['label'].map({0: 0, 2: 1, 4: 2})

# Optional: Reduce size for speed
df = df.sample(n=50000, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,label,text
0,0,@chrishasboobs AHHH I HOPE YOUR OK!!!
1,0,"@misstoriblack cool , i have no tweet apps fo..."
2,0,@TiannaChaos i know just family drama. its la...
3,0,School email won't open and I have geography ...
4,0,upper airways problem


In [3]:
# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.1
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

# Wrap datasets
train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)


In [5]:
# Load pre-trained BERT for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define training args
training_args = TrainingArguments(
    output_dir="./results",                 # Where to save checkpoints
    fp16=True,                              # Enable mixed precision
    num_train_epochs=2,                     # Number of epochs
    per_device_train_batch_size=16,         # Training batch size
    per_device_eval_batch_size=32,          # Eval batch size
    eval_strategy="epoch",            # Evaluate every epoch
    save_strategy="epoch",                  # Save every epoch
    logging_dir="./logs",                   # Directory for logs
    logging_steps=10,                       # Log every 10 steps
    logging_strategy="steps",               # Ensure logging is step-based
    report_to="none",                       # Prevent WandB logging (optional)
    load_best_model_at_end=True,            # Reload best model at the end
    metric_for_best_model="eval_loss",      # Use eval loss to determine best
    greater_is_better=False                 # Lower eval loss is better
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.3186,0.378193
2,0.1636,0.421247


TrainOutput(global_step=5626, training_loss=0.3346876466015756, metrics={'train_runtime': 762.9412, 'train_samples_per_second': 117.965, 'train_steps_per_second': 7.374, 'total_flos': 5920051898880000.0, 'train_loss': 0.3346876466015756, 'epoch': 2.0})

In [7]:
save_path = "model/bert_sentiment_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")


Model saved to model/bert_sentiment_model


In [12]:
!pip install -q streamlit

In [18]:
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K
up to date, audited 23 packages in 967ms
[1G[0K⠼[1G[0K
[1G[0K⠼[1G[0K3 packages are looking for funding
[1G[0K⠼[1G[0K  run `npm fund` for details
[1G[0K⠼[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠼[1G[0K

In [20]:
!pip install pyttsx3

Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Downloading pyttsx3-2.98-py3-none-any.whl (34 kB)
Installing collected packages: pyttsx3
Successfully installed pyttsx3-2.98


In [None]:
!streamlit run app.py & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.125.105.106

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.105.106:8501[0m
[0m
your url is: https://nine-jeans-beam.loca.lt
2025-05-03 18:56:14.821496: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746298574.848720   39665 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746298574.857225   39665 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
🔍 Loading Phi-2 mode