In [1]:
! pip install backoff



In [2]:
import getpass
import os
import re
import sys

import gradio as gr
import pandas as pd
import torch
from gpt_index import SimpleDirectoryReader, GPTListIndex, GPTSimpleVectorIndex, LLMPredictor, PromptHelper, ServiceContext
from gtts import gTTS
from langchain import OpenAI
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Secrets must not be stored in the notebook. Reuse the key when the
# environment already provides it; otherwise ask for it without echoing it.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [14]:
# Read each CSV file and stack the resulting frames into a single DataFrame.
dataset_paths = ['dataset.csv', 'Language Detection.csv']
dataframes = list(map(pd.read_csv, dataset_paths))
df = pd.concat(dataframes)

# Fit the encoder on the 'Language' column and store the resulting integer
# class ids in a new 'label' column.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Language'])

# Tokenizer matching the multilingual DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-multilingual-cased'
)

In [15]:
# Instantiate the multilingual DistilBERT checkpoint with a 30-class
# sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=30,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'pre_classif

In [16]:
# Restore the saved classifier weights onto the CPU.
path = "model_best50.pt"

# torch.load unpickles the file. weights_only=True limits unpickling to
# tensors and plain containers, which is all a state dict needs, so a
# tampered checkpoint cannot run arbitrary code while being loaded.
state_dict = torch.load(path, map_location=torch.device('cpu'), weights_only=True)
model.load_state_dict(state_dict)

# Put the model in evaluation mode; kept as the last expression so the cell
# still displays the module structure.
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [17]:
import backoff
import openai  # previously missing: the function below raised NameError when called
from openai.error import RateLimitError


@backoff.on_exception(backoff.expo, RateLimitError)
def completions_with_backoff(**kwargs):
    """Call ``openai.Completion.create`` and return its response.

    The ``backoff`` decorator re-invokes the call, using the ``backoff.expo``
    wait generator between attempts, whenever ``RateLimitError`` is raised.

    Args:
        **kwargs: Forwarded unchanged to ``openai.Completion.create``.

    Returns:
        The response object returned by ``openai.Completion.create``.
    """
    return openai.Completion.create(**kwargs)

In [18]:
def construct_index(directory_path):
    """Build a vector index from the documents in a directory and save it.

    The documents found by ``SimpleDirectoryReader(directory_path)`` are
    indexed with ``GPTSimpleVectorIndex`` using a service context backed by
    the ``text-davinci-003`` completion model, and the resulting index is
    written to ``chat.json`` in the working directory.

    Args:
        directory_path: Directory handed to ``SimpleDirectoryReader``.

    Returns:
        The newly built ``GPTSimpleVectorIndex``.
    """
    # Sizing settings shared by the prompt helper and the LLM.
    input_limit = 4096
    output_limit = 512
    overlap = 20
    chunk_limit = 600

    helper = PromptHelper(
        input_limit,
        output_limit,
        overlap,
        chunk_size_limit=chunk_limit,
    )
    llm = OpenAI(
        temperature=0.7,
        model_name="text-davinci-003",
        max_tokens=output_limit,
    )
    predictor = LLMPredictor(llm=llm)

    loaded_docs = SimpleDirectoryReader(directory_path).load_data()
    context = ServiceContext.from_defaults(
        llm_predictor=predictor,
        prompt_helper=helper,
    )

    built_index = GPTSimpleVectorIndex.from_documents(
        documents=loaded_docs,
        service_context=context,
    )
    # Persist the index so it can be reloaded without re-embedding.
    built_index.save_to_disk('chat.json')
    return built_index

In [19]:
# Build the vector index over the "Docs" folder. construct_index() also writes
# it to 'chat.json'.
# NOTE(review): this re-embeds every document on each run (the recorded run
# used about 9M embedding tokens); consider reusing 'chat.json' when it exists.
index = construct_index("Docs")

# This cell used to end with index.load_from_disk('chat.json'). That call
# returns a separate index object instead of updating `index`, and its result
# was discarded, so it only re-read the file that had just been written.
# Display the index that the rest of the notebook actually queries.
index

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 8973260 tokens


<gpt_index.indices.vector_store.vector_indices.GPTSimpleVectorIndex at 0x2565b2f75b0>

In [20]:
# Maps a language name, as produced by the label encoder, to the language code
# passed to gTTS.
lang_dict = dict(
    English="en",
    Spanish="es",
    German="de",
    French="fr",
    Italian="it",
    Portuguese="pt",
    Dutch="nl",
    Russian="ru",
    Arabic="ar",
    Japanese="ja",
    Korean="ko",
    Chinese="zh-CN",
)

In [21]:
# Phrase that switches the chatbot into language-detection mode. It is matched
# case-insensitively so that detecting the phrase and splitting on it can never
# disagree (e.g. for "What language is this: ...").
_LANGUAGE_QUERY_RE = re.compile(re.escape("what language is this:"), re.IGNORECASE)

# gTTS language code used when the detected language has no entry in lang_dict.
_FALLBACK_TTS_LANG = "en"


def _detect_language(text):
    """Return the language name the classifier predicts for ``text``."""
    encoding = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])
    # output[0] holds the per-class scores; keep the index of the largest one.
    _, predicted = torch.max(output[0], dim=1)
    return label_encoder.inverse_transform(predicted.cpu().numpy())[0]


def _tts_code(language_name):
    """Return the gTTS code for ``language_name``, or the fallback if unmapped."""
    return lang_dict.get(language_name, _FALLBACK_TTS_LANG)


def _save_speech(text, language_code, path):
    """Synthesize ``text`` with gTTS in ``language_code`` and save it to ``path``."""
    gTTS(text=text, lang=language_code, slow=False).save(path)


def chatbot(text):
    """Answer ``text`` with two synthesized audio files.

    If ``text`` contains the phrase "what language is this:" (in any letter
    case), the text following it is classified and the first audio announces
    the detected language in English, while the second reads that text aloud
    in the detected language. Otherwise the text is sent to the document
    index, the language of the answer is classified, and both the answer
    (first audio) and the input text (second audio) are read in that language.

    A detected language without an entry in ``lang_dict`` is read with the
    English voice instead of raising ``KeyError``.

    Args:
        text: The user's input string.

    Returns:
        The tuple ``('test.wav', 'test2.wav')`` with the paths of the two
        audio files written to the working directory.
    """
    # NOTE(review): gTTS is documented to emit MP3 data; the '.wav' file names
    # are kept unchanged here - confirm gr.Audio decodes them as intended.
    parts = _LANGUAGE_QUERY_RE.split(text)
    if len(parts) > 1:
        # Language-detection request: classify what follows the trigger phrase.
        sample = parts[1].strip()
        language_name = _detect_language(sample)
        response = f"The detected language is {language_name}"
        response_code = 'en'
        echo_text = sample
        echo_code = _tts_code(language_name)
    else:
        # Regular question: answer from the index, then detect the answer's
        # language so both audios use a matching voice.
        response = index.query(text, response_mode="compact").response
        response_code = _tts_code(_detect_language(response))
        echo_text = text
        echo_code = response_code

    _save_speech(response, response_code, "test.wav")
    _save_speech(echo_text, echo_code, "test2.wav")
    return 'test.wav', 'test2.wav'
  

In [None]:
# Heading and introduction shown at the top of the Gradio page.
title = "Chatbot for Languages"
description = "This app can detect the language of your input text and generate a response. It includes up to 30 languages such as English, Spanish, French, German, Italian, Portuguese, Dutch, Russian, Arabic, Chinese, Japanese,Korean, and many more."

# Sample prompts offered by the interface.
examples = [
    "what language is this: How are you",
    "what language is this: Comment allez-vous",
    "what language is this: Cómo estás",
    "what language is this: Wie geht es dir",
    "what language is this: 어떻게 지내세요",
    "what language is this: どうもありがとうございます",
    "what language is this: Hoe is het",
    "what language is this: Come stai",
    "what language is this: 你好吗",
    "what language is this: 元気ですか",
    "what language is this: كيف حالك",
]

# HTML passed to the interface as its `article` text.
article = '''<div>
                            <p style="text-align: center"> All you need to do is to type a text in any of the listed language. Then click on Play/Pause to hear the name of the language. The audio is saved in a wav format.</p>
                        </div>'''

# One text box in; two audio players out (the answer, then the input read aloud).
text_input = gr.Textbox(label="Please type text here")
audio_outputs = [
    gr.Audio(label="Output speech"),
    gr.Audio(label="Reading input text"),
]

iface = gr.Interface(
    fn=chatbot,
    inputs=text_input,
    outputs=audio_outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
)
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7864
Running on public URL: https://9337b92f13d65a2029.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3798 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3746 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3919 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3931 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 5 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3883 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 4 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 3781 tokens
INFO:gpt_index.to