In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch
import numpy as np

# 영/한 댓글 분류

## - filepath 변경

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, udf
from pyspark.sql.types import BooleanType, StringType
from langid.langid import LanguageIdentifier, model


spark = SparkSession.builder.appName("UdemyCommentsAnalysis").getOrCreate()
#df = spark.read.format("csv") \
#                       .option("header", "true") \
#                       .option("inferSchema", "true") \
#                       .option("encoding","UTF-8") \
#                       .load("merged_comments_30rows.csv")

file_path = "file:///home/hyunjin6/Documents/workspace/merged_comments.csv"

reviews_df = spark.read.csv(file_path, header=True, inferSchema=True)

reviews_df = reviews_df.withColumn('comment', lower(col('comment')))
reviews_df = reviews_df.withColumn('comment', regexp_replace(col('comment'), '[^\w\s]', ''))

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

def detect_language_langid(comment):
    try:
        lang, _ = identifier.classify(comment)
        return lang
    except:
        return "Unknown"

detect_language_udf = udf(detect_language_langid, StringType())

reviews_df = reviews_df.withColumn("language", detect_language_udf(col("comment")))

filtered_comments_df = reviews_df.filter((col("language") == "en") | (col("language") == "ko"))


print(filtered_comments_df)

                                                                                

DataFrame[course_id: string, comment: string, language: string]


In [4]:
filtered_comments_df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+--------------------+--------+
|course_id|             comment|language|
+---------+--------------------+--------+
|  3173036|i think a beginne...|      en|
|  4913148|aviva is such a n...|      en|
|  3175814|this course is th...|      en|
|  3174896|i found this cour...|      en|
|  4693438|nothing informati...|      en|
|  4693272|multiple spelling...|      en|
|  3168632|very unique way o...|      en|
|  3188362|                    |      en|
|  4164550|good course  info...|      en|
|  4164836|thanks kate great...|      en|
|  4693624|halfway thru very...|      en|
|  4695130|its a pretty good...|      en|
|  4694990|it was very nice ...|      en|
|  4165910|it is the best co...|      en|
|  4695172|i have watched tw...|      en|
|  4694460|all about radio a...|      en|
|  4163248|should have provi...|      en|
|  3175482|amazing course an...|      en|
|  3157018|                    |      en|
|  3152462|excellent present...|      en|
+---------+--------------------+--

                                                                                

In [None]:
korean_comments_df = filtered_comments_df.filter(col("language") == "ko")
korean_comments_df.show()


In [None]:
korean_comments_count = korean_comments_df.count()
print(f"Number of comments in Korean: {korean_comments_count}")

In [None]:
filtered_comments_df = filtered_comments_df.limit(5000)
filtered_comments_df.show()

In [None]:
filtered_comments_df.repartition(1).write.csv("file:///home/hyunjin6/Documents/workspace/comments_enko.csv", header=True, mode="overwrite")


In [10]:
#test

test_comments = ["안녕하세요", "Hello", "Bonjour", "こんにちは"]
for comment in test_comments:
    print(f"Comment: {comment} -> Language: {detect_language_langid(comment)}")



Comment: 안녕하세요 -> Language: ko
Comment: Hello -> Language: en
Comment: Bonjour -> Language: en
Comment: こんにちは -> Language: ja


# 영어 댓글 긍/부정 분석

In [11]:
def preprocess_data(comments, tokenizer, max_len):
    inputs = tokenizer.batch_encode_plus(
        comments,
        add_special_tokens=True,
        max_length=max_len,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return inputs

def classify_sentiment(comments):
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    
    max_len = 64
    inputs = preprocess_data(comments, tokenizer, max_len)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    dataset = TensorDataset(input_ids, attention_mask)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

    model.eval()
    sentiments = []

    for batch in dataloader:
        batch = tuple(t.to('cpu') for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=1)
            sentiments.append(probs.cpu().numpy())

    sentiments = np.concatenate(sentiments, axis=0)
    sentiments = np.argmax(sentiments, axis=1)

    return sentiments

In [12]:
comments = english_comments_df.select('comment').rdd.flatMap(lambda x: x).collect()
sentiments = classify_sentiment(comments)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
english_comments_pd = english_comments_df.toPandas()
english_comments_pd['sentiment'] = sentiments
english_comments_pd['sentiment_label'] = english_comments_pd['sentiment'].apply(lambda x: 'positive' if x > 2 else 'negative')

                                                                                

In [14]:
positive_en_comments_pd = english_comments_pd[english_comments_pd['sentiment_label'] == 'positive']
negative_en_comments_pd = english_comments_pd[english_comments_pd['sentiment_label'] == 'negative']

In [15]:
merged_en_comments_pd = pd.concat([positive_en_comments_pd, negative_en_comments_pd])

#print(merged_en_comments_pd)

In [None]:
merged_en_comments_pd.to_csv('en_comments_p_n.csv', index=False)

In [16]:
print(merged_en_comments_pd)

   course_id                                            comment language  \
1    4913148  aviva is such a natural teacher and healerheal...       en   
2    3175814  this course is the best on udemy  this breakda...       en   
3    3174896  i found this course very helpful it was full o...       en   
6    3168632  very unique way of teaching simple but powerfu...       en   
7    3188362                                                          en   
8    4164550  good course  information is well organized cle...       en   
9    4164836  thanks kate great course and valuable informat...       en   
10   4693624            halfway thru very good course thank you       en   
11   4695130  its a pretty good course ive managed to create...       en   
12   4694990  it was very nice explanation many thanks for s...       en   
13   4165910  it is the best course i have seen on these top...       en   
14   4695172               i have watched two lesson it is good       en   
17   3175482

In [None]:
spark.stop()