In [1]:
import pandas as pd
import string
import os

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama

with open('./YoutubeComments/yt-comments.txt', 'r', encoding='utf-8') as file:
      lines = file.readlines()
      lines = [line.strip() for line in lines if line.strip()]
      df = pd.DataFrame(lines, columns=['text'])
      
# For testing purpose:
sentence1 = '''Thank you so much man, this is awesome content''' # False
sentence2 = '''Can you make another video like this.''' # True
sentence3 = '''Can I ask you this, make another video like this!''' # True
sentence4 = '''What a beautiful video.''' # False
sentence5 = '''What were you saying at 5:13 mark''' # True 
sentence6 = '''Where can I find more material like this''' # True
sentence7 = '''I don't get when you said this''' # True
sentence8 = '''What were you saying at 5:13 mark''' # True 
sentence9 = '''I don't understand this part that says''' # True
sentence10 = '''I really don't get why people are saying bad things about this video''' # True


def is_clean_english(text):
    if not isinstance(text, str):
        return False
    allowed_chars = set(string.ascii_letters + string.digits + ' .,!?\'\"')
    return all(ord(char) < 128 and char in allowed_chars for char in text)

def filter_dataframe(df, columns_to_check=None):
    df_copy = df.copy()
    
    if columns_to_check is None:
        columns_to_check = df.select_dtypes(include=['object']).columns
    
    stats = {
        'total_rows': len(df),
        'rows_removed': 0,
        'removed_examples': [],
        'issues_found': {col: 0 for col in columns_to_check}
    }
    
    keep_mask = pd.Series(True, index=df.index)
    
    for col in columns_to_check:
        if col not in df.columns:
            continue
            
        # Update mask to keep only clean English text
        col_mask = df[col].apply(is_clean_english)
        keep_mask &= col_mask
        
        # Update statistics
        stats['issues_found'][col] = (~col_mask).sum()
        
        # Collect some examples of removed text
        if len(stats['removed_examples']) < 5:
            examples = df[~col_mask][col].head()
            stats['removed_examples'].extend(
                f"Column '{col}': {text}" for text in examples
            )
    
    # Apply the filter
    df_filtered = df_copy[keep_mask]
    
    # Update final statistics
    stats['rows_removed'] = len(df) - len(df_filtered)
    stats['rows_remaining'] = len(df_filtered)
    stats['removal_percentage'] = (stats['rows_removed'] / stats['total_rows'] * 100)
    
    return df_filtered, stats

#Do not try to explain or include information other than true or false in your response under any circumstance, just simply provide true or false answer.
def is_question_by_llm(sentence):
    llm = ChatOllama(model="llama3.1:latest", temperature=0)
    prompt = PromptTemplate.from_template(
      """
      <s> [INST]
      Return true if given sentence is a question or inquiry. Return false otherwise.
      If a sentences contains '?', it's very likely to be a question, but a sentence is not a quesiton if contains '!'.
      Consider sentences that include phrases like "I don't understand..." or "I am confused..." also as questions.
      If unsure, return false.
      [/INST] </s> 
      [INST]
      Sentence: {sentence} 
      Answer: 
      [/INST]
      """
    )
    
    
    # prompt = PromptTemplate.from_template(
    #   """
    #   <s> [INST] 
    #   Return true if given sentence is a question or inquiry or a statement that induces response. This includes statement that expresses confusion or ambiguity. Return false otherwise.
    #   Do not try to explain, just simply provide true or false answer.
    #   [/INST] </s> 
    #   [INST]
    #   Sentence: {sentence} 
    #   Answer: 
    #   [/INST]
    #   """
    # )
    
    chain = prompt | llm
    query_chain = chain.invoke(
        {
            "sentence": sentence
        }
    )
    return query_chain.content



In [2]:
# # shuffle
# shuffled_df = df.sample(frac=1).reset_index(drop=True)

# # filter out symbols, and grab first n = 100 data points
# result_df, stats = filter_dataframe(shuffled_df.head(100))

# # remove processed 100 data points from df
# merged_df = shuffled_df.merge(result_df, on=['text'], how='left', indicator=True)
# reduced_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')
# df = reduced_df.copy()

# # Apply the question detection function
# result_df['label'] = result_df['text'].apply(is_question_by_llm)


# # Clean the response into 1 or 0
# result_df['label'] = result_df['label'].apply(lambda x: 1 if str(x).strip().lower().replace('.','') == 'true' else 0)
# file_exists = os.path.isfile('output.csv')
# result_df.to_csv('output.csv', 
#                  mode='a',  # append mode
#                  header=not file_exists,  # only write header if file doesn't exist
#                  index=False)

In [3]:
result_df, stats = filter_dataframe(df.head(10))

# Apply the question detection function
result_df['label'] = result_df['text'].apply(is_question_by_llm)

# Clean the response into 1 or 0
result_df['label'] = result_df['label'].apply(lambda x: 1 if str(x).strip().lower().replace('.','') == 'true' else 0)
file_exists = os.path.isfile('output.csv')
result_df.to_csv('output.csv', 
                 mode='a',  # append mode
                 header=not file_exists,  # only write header if file doesn't exist
                 index=False)

In [4]:
print(is_question_by_llm("You did that at the end on purpose!"))

Based on the instructions, I would answer:

**False**

The sentence contains an exclamation mark '!', which suggests it's a statement rather than a question. The presence of '!'' is actually a strong indicator that it's not a question, according to the instructions.
