In [None]:
!pip install pyngrok

In [None]:
# GPT-Neo Server Code (Kaggle Notebook 2)
import os
import threading
import time

import msgpack
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request, Response
from pyngrok import ngrok, conf
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Ngrok config
# Prefer the NGROK_AUTHTOKEN environment variable so a real token does not
# have to be written into the notebook; fall back to the in-file placeholder.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN", "YOUR_NGROK_AUTHTOKEN")
conf.get_default().auth_token = NGROK_AUTHTOKEN

# Checkpoint used for both the tokenizer and the model weights.
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

# Choose the device before loading so the weight dtype can match it:
# half precision is only requested when a GPU is available; on CPU the
# model is loaded in float32.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_dtype = torch.float16 if device.type == 'cuda' else torch.float32

# Load GPT-Neo model
print("Loading GPT-Neo...")
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPTNeoForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=model_dtype)
# Reuse the end-of-sequence token as the padding token so that
# tokenizer calls with padding=True have a pad token to use.
tokenizer.pad_token = tokenizer.eos_token

model.to(device)
model.eval()  # inference only
print(f"GPT-Neo model loaded on {device}")

app = FastAPI()

@app.get("/")
def read_root():
    """Health-check endpoint: report that the server is up, plus model name and version."""
    status = dict(
        message="GPT-Neo Server is running!",
        model="gpt-neo",
        version="v2.0",
    )
    return status

@app.post("/inference")
async def inference(request: Request):
    """Score a text with GPT-Neo and return per-token losses as msgpack.

    The raw request body must be a msgpack-encoded map containing a
    non-empty string under the key ``text``. Other keys (for example
    ``do_generate``) are accepted but not used by this handler.

    The text is tokenized (truncated to 128 tokens), run through the model
    once, and the cross-entropy of every token given its predecessors is
    computed.

    Returns:
        A binary ``Response`` whose body is the msgpack encoding of the
        3-tuple ``(token_losses, begin_word_idx, ll_tokens)`` where
        ``token_losses`` is the list of per-token losses (one fewer than
        the number of input tokens), ``begin_word_idx`` is always ``1``
        and ``ll_tokens`` is the element-wise negation of ``token_losses``.

    Raises:
        HTTPException: 400 when the body is not decodable msgpack or lacks
            a usable ``text`` field; 500 for any failure during inference.
    """
    try:
        body = await request.body()
        print(f"Received body length: {len(body)}")

        # A body that cannot be decoded is the client's fault, not ours.
        try:
            request_data = msgpack.unpackb(body, raw=False)
        except Exception as e:
            raise HTTPException(
                status_code=400, detail=f"Invalid msgpack body: {e}"
            ) from e

        if not isinstance(request_data, dict) or not isinstance(request_data.get('text'), str):
            raise HTTPException(
                status_code=400,
                detail="Request must be a msgpack map with a string 'text' field",
            )
        text = request_data['text']
        if not text:
            # An empty string tokenizes to zero tokens, which the model
            # forward pass cannot handle.
            raise HTTPException(status_code=400, detail="'text' must not be empty")

        print(f"Processing text: {text[:50]}...")

        # Tokenize with shorter max_length to avoid OOM
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

            # Position i predicts token i+1: drop the last logit row and the
            # first label. Upcast to float32 so the softmax over the
            # vocabulary is not computed in half precision.
            shift_logits = logits[..., :-1, :].float().contiguous()
            shift_labels = inputs['input_ids'][..., 1:].contiguous()

            loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
            losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            losses = losses.view(shift_labels.shape)

            # Only one text is processed per request, so take row 0.
            token_losses = losses[0].cpu().numpy().tolist()
            begin_word_idx = 1
            ll_tokens = [-loss for loss in token_losses]

        response_tuple = (token_losses, begin_word_idx, ll_tokens)
        packed_response = msgpack.packb(response_tuple)

        print("Response prepared successfully")

        return Response(
            content=packed_response,
            media_type="application/octet-stream"
        )

    except HTTPException:
        # Deliberate client-error responses raised above must not be
        # rewritten into a 500 by the generic handler below.
        raise
    except Exception as e:
        print(f"Error in inference: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e)) from e

def run_app():
    """Serve the FastAPI ``app`` with uvicorn on all interfaces, port 8000."""
    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)

# Start server
# Terminate any ngrok process pyngrok started earlier before opening a new tunnel.
ngrok.kill()
# Daemon thread: the uvicorn server must not keep the interpreter alive
# once the keep-alive loop below has been interrupted.
server_thread = threading.Thread(target=run_app, daemon=True)
server_thread.start()
# Give uvicorn a moment to start listening before exposing the port.
time.sleep(5)

public_url = ngrok.connect(8000)
print(f"\n{'='*50}")
print(f"GPT-Neo Server URL: {public_url}")
print(f"{'='*50}")

# Block the main thread so the server and tunnel stay up; on interrupt,
# close the tunnel instead of leaving it open.
try:
    while True:
        time.sleep(60)
except KeyboardInterrupt:
    print("Shutting down GPT-Neo server...")
finally:
    ngrok.kill()