In [None]:
# Mount Google Drive so the notebook can read its input CSV files from Drive
# and save its output files back to Drive.
from google.colab import drive
# The '/content/drive/...' paths used in the cells below resolve under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


# Preprocessing of news articles file

In [None]:
import pandas as pd
import csv
import re

#The file contained various special characters in different formats, so everything is removed except letters, digits, spaces and a small set of punctuation characters.
def clean_text(text):
    """Strip disallowed characters from ``text`` and normalise its whitespace.

    Only ASCII letters, digits, spaces and the punctuation ``, $ ( ) : ; .``
    are kept; every other character is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with every run of whitespace reduced to a single
        space. Leading/trailing spaces are not stripped.
    """
    # Convert every run of whitespace (including a single newline or tab) to
    # one space *before* filtering. Newlines and tabs are not in the allowed
    # set, so deleting them directly would glue the neighbouring words together.
    text = re.sub(r'\s+', ' ', text)

    #Regular expression for characters that are allowed
    pattern = r'[^a-zA-Z0-9,\$\(\):\;\. ]'

    #Replace the special characters with empty string
    cleaned_text = re.sub(pattern, '', text)

    # Deleting a character that sat between two spaces ("a - b") leaves a
    # doubled space behind, so collapse spaces once more after filtering.
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)

    return cleaned_text

#Path of input file
input_csv_file = '/content/drive/My Drive/news_dataset.csv'

#Reading the csv file in a df; rows with missing values are removed
df = pd.read_csv(input_csv_file).dropna()

# Apply the clean_text function to preprocess the 'content' column
df['content'] = df['content'].apply(clean_text)

# Define a regular expression pattern to match lines with timestamps and program titles
pattern = r'^\d{2}:\d{2}\s[A-Z ]+'

# Drop timestamp/program-title rows BEFORE the colon-based prefix removal below.
# Once the text up to the first colon is cut away, a row such as
# "12:30 PROGRAM TITLE" has become "30 PROGRAM TITLE" and can no longer match.
df = df[~df['content'].str.match(pattern)].copy()

#The column contained unnecessary words like recorded reports or authors name, so remove these by removing everything from start to the first colon (:)
df['content'] = df['content'].str.split(':', n=1).str[-1]

#Remove the text 'copyright' with year present in some articles
# (raw string: '\d' inside a normal string literal is an invalid escape sequence)
df['content'] = df['content'].str.replace(r'Copyright Business Recorder, \d{4}\.', '', regex=True)

# Re-apply the filter after the prefix removal so that every row removed by
# the previous ordering of these steps is still removed.
df = df[~df['content'].str.match(pattern)]

#Path for output csv file
output = '/content/drive/My Drive/clean_news_data.csv'

# Write the articles with 10 or more words to the output CSV file, one article
# per row. No header row is written.
with open(output, 'w', newline='', encoding='utf-8') as output_file:
    writer = csv.writer(output_file)
    writer.writerows(
        [content] for content in df['content'] if len(content.split()) >= 10
    )

print("Preprocessed data saved to:", output)


# Preprocessing of reviews_only file



In [None]:
import pandas as pd
import re

# Function to clean the text
def clean_text(text):
    """Return ``text`` reduced to letters, digits, whitespace, '.' and ','.

    Runs of repeated full stops or commas are collapsed to a single
    character, and surrounding whitespace is trimmed from the result.
    """
    stripped = re.compile(r'[^a-zA-Z0-9\s.,]').sub('', text)
    # Collapse repeated punctuation marks one mark at a time. The '?' pass
    # cannot match anything here because '?' was already removed above.
    for mark in ('.', '?', ','):
        stripped = re.sub(re.escape(mark) + r'{2,}', mark, stripped)
    return stripped.strip()

#Path of input csv file
file_path = '/content/drive/MyDrive/reviews_only.csv'

# Read the csv file and discard rows that contain missing values
df = pd.read_csv(file_path).dropna()

# Number of whitespace-separated words in each raw review
word_counts = df['Review Text'].apply(lambda review: len(review.split()))

# Keep reviews with at least 10 words, add the cleaned text as a new column,
# and drop rows whose cleaned text duplicates an earlier row
df = (
    df[word_counts >= 10]
    .assign(**{'Cleaned Review': lambda frame: frame['Review Text'].apply(clean_text)})
    .drop_duplicates(subset=['Cleaned Review'])
)

# extract the cleaned review text
cleaned_reviews = df['Cleaned Review']
print(cleaned_reviews)

# Save the cleaned reviews to a separate CSV file
cleaned_reviews.to_csv('/content/drive/MyDrive/cleaned_reviews.csv', index=False)


0       1. Exterior and interior both are out class.2....
1       eliability is a real strong point for the Coro...
2       Grande is the best car  its fuel economy is be...
3       Toyota Corolla 1982 GLexterior of this car is ...
4       Exterior of car is awesome.Pick up and engine ...
                              ...                        
9527    Hello guys, my experience with Honda CG 125 ha...
9528    this bike best but please launch ybr 125 Custo...
9529    The look and feel of the the bike is good, the...
9530    First of all look is good decent and stylish b...
9531    Lorem Ipsum is simply dummy text of the printi...
Name: Cleaned Review, Length: 4877, dtype: object


# Joining both files

In [None]:
import pandas as pd

# Load both files into DataFrames
# NOTE: the news path is the file written by the news preprocessing cell
# ('clean_news_data.csv'); the previous name 'clean_news_data_.csv' is not
# produced anywhere in this notebook.
file1_path = '/content/drive/My Drive/clean_news_data.csv'
file2_path = '/content/drive/MyDrive/cleaned_reviews.csv'

# The news file is written row by row with csv.writer and has no header line,
# so it must be read with header=None; otherwise the first article is consumed
# as the column name.
news_df = pd.read_csv(file1_path, header=None, names=['text'])

# The reviews file carries a 'Cleaned Review' header. Give both frames the same
# column name so concat stacks them into one column instead of producing two
# half-empty columns.
reviews_df = pd.read_csv(file2_path).rename(columns={'Cleaned Review': 'text'})

# Concatenate both df
merged_df = pd.concat([news_df, reviews_df], ignore_index=True)

# Save the concatenated df to a new file
output = '/content/drive/My Drive/complete_file.csv'
merged_df.to_csv(output, index=False)

print("Merged file saved to:", output)


# LLM

In [None]:
#installing necessary libraries
!pip install transformers



In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [None]:
!pip install ctransformers

Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ctransformers
Successfully installed ctransformers-0.2.27


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/817.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/817.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m440.3/817.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m788.5/817.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.

In [None]:
#The Model used for this task is TheBloke/Mistral-7B-Instruct-v0.1-GGUF.

In [None]:
import csv
from langchain.llms import CTransformers
from langchain.chains import QAGenerationChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os
import json
import time
from PyPDF2 import PdfReader

#Loading Language model
def load_llm():
    """Create the Mistral-7B-Instruct (GGUF) model wrapped for LangChain.

    ``model`` is a Hugging Face Hub repository id, not a local path; the
    recorded run output shows the weights file being downloaded on first use.

    Returns:
        A ``CTransformers`` LLM instance.
    """
    # The LangChain CTransformers wrapper takes generation settings through
    # its `config` dict. Passed as top-level keyword arguments they are not
    # forwarded to the underlying model, which then runs with its defaults.
    generation_config = {
        "max_new_tokens": 1048,
        "temperature": 0.3,
    }
    llm = CTransformers(
        model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        model_type="mistral",
        config=generation_config,
    )
    return llm

#File Processing
def file_processing(file_path):
    """Read the first column of a CSV file and split it into document chunks.

    Args:
        file_path: Path of the CSV file; only the first field of each row is used.

    Returns:
        A tuple ``(document_ques_gen, document_answer_gen)``:
        ``document_ques_gen`` is a list of ``Document`` objects built from
        500-character chunks (50 overlap) for question generation, and
        ``document_answer_gen`` is the same material re-split into
        300-character chunks (30 overlap) for answer retrieval.
    """
    # newline='' is what the csv module expects for files handed to csv.reader;
    # the encoding is explicit so reading does not depend on the platform locale.
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing on row[0].
        first_fields = [row[0] for row in reader if row]

    # Join the rows with a newline so the last word of one row is not fused
    # with the first word of the next, and so the splitter has a natural
    # boundary between rows.
    question_gen = "\n".join(first_fields)

    splitter_ques_gen = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 50
    )

    chunks_ques_gen = splitter_ques_gen.split_text(question_gen)

    document_ques_gen = [Document(page_content=t) for t in chunks_ques_gen]

    splitter_ans_gen = RecursiveCharacterTextSplitter(
        chunk_size = 300,
        chunk_overlap = 30
    )

    document_answer_gen = splitter_ans_gen.split_documents(
        document_ques_gen
    )

    return document_ques_gen, document_answer_gen

#Prompt Engineering
def llm_pipeline(file_path):
    """Generate questions from a CSV file and build a chain that answers them.

    The text is chunked by ``file_processing``. A "refine" summarize chain with
    question-writing prompts produces the questions, and a FAISS vector store
    over the smaller chunks backs a ``RetrievalQA`` chain for the answers.

    Args:
        file_path: Path of the CSV file passed on to ``file_processing``.

    Returns:
        A tuple ``(answer_generation_chain, filtered_ques_list)``: the
        ``RetrievalQA`` chain and the list of generated question strings.
    """
    # Function-scope import: the notebook's import cell only brings in the
    # BGE-specific embedding wrapper.
    from langchain.embeddings import HuggingFaceEmbeddings

    document_ques_gen, document_answer_gen = file_processing(file_path)

    # Load the model once and share it between question generation and answer
    # generation; both chains use identical settings and run one after the
    # other, so a second copy of the weights is not needed.
    llm = load_llm()

    prompt_template = """
    You are an expert at creating questions based on text data.
    Your goal is to prepare students for test and exams.
    You do this by asking questions about the text below:

    ------------
    {text}
    ------------

    Create questions that will help customers know about the automobile.
    Make sure not to lose any important information.

    QUESTIONS:
    """

    PROMPT_QUESTIONS = PromptTemplate(template=prompt_template, input_variables=["text"])

    refine_template = ("""
    You are an expert at creating practice questions based on text data.
    Your goal is to prepare the student for exam and test.
    We have received some practice questions to a certain extent: {existing_answer}.
    We have the option to refine the existing questions or add new ones.
    (only if necessary) with some more context below.
    ------------
    {text}
    ------------

    Given the new context, refine the original questions in English.
    If the context is not helpful, please provide the original questions.
    QUESTIONS:
    """
    )

    REFINE_PROMPT_QUESTIONS = PromptTemplate(
        input_variables=["existing_answer", "text"],
        template=refine_template,
    )

    ques_gen_chain = load_summarize_chain(llm = llm,
                                            chain_type = "refine",
                                            verbose = True,
                                            question_prompt=PROMPT_QUESTIONS,
                                            refine_prompt=REFINE_PROMPT_QUESTIONS)

    ques = ques_gen_chain.run(document_ques_gen)

    # all-mpnet-base-v2 is a plain sentence-transformers model. The BGE wrapper
    # prepends a BGE-specific instruction to every query, which this model was
    # not trained with, so the generic wrapper is used instead.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

    vector_store = FAISS.from_documents(document_answer_gen, embeddings)

    # Keep only lines that look like a question or a sentence. Lines are
    # stripped first so indentation does not end up in the question text and
    # trailing whitespace does not cause a valid line to be discarded.
    stripped_lines = (line.strip() for line in ques.split("\n"))
    filtered_ques_list = [line for line in stripped_lines if line.endswith(('?', '.'))]

    answer_generation_chain = RetrievalQA.from_chain_type(llm=llm,
                                                chain_type="stuff",
                                                retriever=vector_store.as_retriever())

    return answer_generation_chain, filtered_ques_list

#Generating CSV file in the drive
def get_csv(file_path):
    """Generate question/answer pairs for ``file_path`` and save them as CSV.

    The questions and the answering chain come from ``llm_pipeline``. Each
    question is answered in turn, echoed to stdout, and written to
    ``output/QA.csv``. That path is relative to the current working directory,
    not to the mounted Drive.

    Args:
        file_path: Path of the input CSV file passed on to ``llm_pipeline``.

    Returns:
        The path of the CSV file that was written.
    """
    answer_generation_chain, ques_list = llm_pipeline(file_path)
    base_folder = 'output/'
    # exist_ok avoids a separate existence check and also creates missing parents.
    os.makedirs(base_folder, exist_ok=True)
    output_file = base_folder+"QA.csv"
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["Question", "Answer"])

        for question in ques_list:
            print("Question: ", question)
            # The recorded run shows answers starting with padding whitespace;
            # trim it so the saved CSV cells contain only the text.
            answer = answer_generation_chain.run(question).strip()
            print("Answer: ", answer)
            print("--------------------------------------------------\n\n")

            # Save answer to CSV file
            csv_writer.writerow([question, answer])
    return output_file


# Input CSV whose first column supplies the text for question/answer generation.
# NOTE(review): no visible cell in this notebook writes 'final.csv' (the merge
# cell writes 'complete_file.csv') - confirm this is the intended input file.
csv_file_path = "/content/drive/MyDrive/final.csv"
# Run the whole pipeline: generate questions, answer each one, save the pairs to CSV.
output_file = get_csv(csv_file_path)
print("CSV file generated:", output_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.1.Q2_K.gguf:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert at creating questions based on text data.
    Your goal is to prepare students for test and exams.
    You do this by asking questions about the text below:

    ------------
    New modern look , parking sensors, rear camera, 12v power soket, USB, touch multimedia screen, keyless entry, stylish speedometer, amazing and unexpected AC performance, 1518 fuel economy, good grip on road, smooth power steering only leg space is compromisable otherwise no better option in this price braketGood condition  my favorite car is low range  Suzuki liana I am full satisfied customer and I am a very thankful to Suzuki and i am recommend buy Suzuki liana for family and enjoy your trip
    ------------

    Create questions that will help customers know about the automobile.
    Make sure not to lose any important information.

    QUESTIONS:
   




[1m> Finished chain.[0m

[1m> Finished chain.[0m


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Question:  1) What are the modern features of the Suzuki Liana?
Answer:   The Suzuki Liana comes with a range of modern features such as New modern look, parking sensors, rear camera, 12v power socket, USB, touch multimedia screen, keyless entry, stylish speedometer, amazing and unexpected AC performance, good grip on road, smooth power steering.
--------------------------------------------------


Question:      2) Does the Suzuki Liana have parking sensors and a rear camera?
Answer:   Yes, the Suzuki Liana comes with parking sensors and a rear camera.
--------------------------------------------------


Question:      3) Can the Suzuki Liana run on 12v power sockets and has USB ports?
Answer:      Yes, the Suzuki Liana has 12v power sockets and USB ports.
--------------------------------------------------


Question:      4) What is the fuel economy of the Suzuki Liana?
Answer:       1518
--------------------------------------------------


CSV file generated: output/QA.csv
