In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import joblib


In [3]:
df = pd.read_csv('dataset/relabeled_sentences.csv',  delimiter=',', quoting=2, encoding="utf-8")

In [5]:
df.head(5)

Unnamed: 0,id,tweets,labels,new_labels
0,0.0,ChatGPT: Optimizing Language Models for Dialog...,neutral,0.0
1,1.0,"Try talking with ChatGPT, our new AI system wh...",good,0.136364
2,2.0,ChatGPT: Optimizing Language Models for Dialog...,neutral,0.0
3,3.0,"THRILLED to share that ChatGPT, our new model ...",good,0.302273
4,4.0,"As of 2 minutes ago, @OpenAI released their ne...",bad,0.211039


In [7]:
df.drop(columns='labels')

Unnamed: 0,id,tweets,new_labels
0,0.0,ChatGPT: Optimizing Language Models for Dialog...,0.000000
1,1.0,"Try talking with ChatGPT, our new AI system wh...",0.136364
2,2.0,ChatGPT: Optimizing Language Models for Dialog...,0.000000
3,3.0,"THRILLED to share that ChatGPT, our new model ...",0.302273
4,4.0,"As of 2 minutes ago, @OpenAI released their ne...",0.211039
...,...,...,...
219289,219289.0,Other Software Projects Are Now Trying to Repl...,-0.125000
219290,219290.0,I asked #ChatGPT to write a #NYE Joke for SEOs...,0.052841
219291,219291.0,chatgpt is being disassembled until it can onl...,0.000000
219292,219292.0,2023 predictions by #chatGPT. Nothing really s...,-0.125000


In [8]:
def procces_sentence(review:str)->str:
    sample_review = re.sub(r'http\S+', '', review)
    sample_review = re.sub("[^a-zA-Z]",' ',sample_review)
    sample_review = sample_review.lower()
    sample_review = sample_review.split()
    swords = set(stopwords.words("english"))                     
    # sample_review = [w for w in sample_review if w not in swords]        
    sample_review = [w for w in sample_review if w not in swords and len(w) > 1]
    sample_review = " ".join(sample_review)
    return sample_review

def take_res(count:int)->int:
    labels = df["labels"][count]
    if labels == "bad": return 0
    elif labels == "neutral": return 1
    elif labels == "good": return 2


In [9]:
from tqdm import tqdm

count_t = 219288

trainx = []
trainy = []
for i in tqdm(range(219200)):
    trainx.append(procces_sentence(df["tweets"][i]))
    trainy.append(take_res(i))



100%|██████████| 219200/219200 [00:38<00:00, 5728.00it/s]


In [10]:
trainy = np.array(trainy)
trainy
train_x, test_x, y_train, y_test = train_test_split(trainx,trainy, test_size = 0.1)
vectorizer = CountVectorizer( max_features = 40000 )
train_x = vectorizer.fit_transform(train_x)



In [11]:
test_xx = vectorizer.transform(test_x)
test_xx
test_xx = test_xx.toarray()

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

model = LinearRegression()
model.fit(train_x, y_train)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Tahminler
y_pred = model.predict(test_xx)

# Metrikleri hesaplayalım
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # MSE'nin karekökü
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}")


Mean Absolute Error (MAE): 0.38401412461331763
Mean Squared Error (MSE): 0.27395079826629937
Root Mean Squared Error (RMSE): 0.5234030934817824
R-squared (R²): 0.6027700046665514
Mean Absolute Percentage Error (MAPE): 711650190803296.2




In [23]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier
import pandas as pd
import numpy as np

# Modelleri ekleyelim
model_r = {
    # Regresyon Modelleri

    # 'Random Forest Regressor': RandomForestRegressor(),
    # 'Extra Trees Regressor': ExtraTreesRegressor(),
    'AdaBoost Regressor': AdaBoostRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'K Neighbors Regressor': KNeighborsRegressor(),
    'CatBoost Regressor': CatBoostRegressor(learning_rate=0.1, depth=6, iterations=500, verbose=0),
    

}

# Sonuçları ayrı ayrı kaydedelim
results_regression = {}
results_classification = {}

# Regresyon Modellerini Çalıştır
for name, model in model_r.items():
    model.fit(train_x, y_train)
    y_pred = model.predict(test_xx)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results_regression[name] = {'RMSE': rmse}
    print(f'{name}: RMSE = {rmse}')



# Sonuçları DataFrame'e Çevir
results_regression_df = pd.DataFrame(results_regression).T
results_classification_df = pd.DataFrame(results_classification).T

# Sonuçları yazdır
print("Regression Results:")
print(results_regression_df)
print("\nClassification Results:")
print(results_classification_df)


AdaBoost Regressor: RMSE = 0.807108359700914
Decision Tree Regressor: RMSE = 0.5112562770162078
K Neighbors Regressor: RMSE = 0.8050898757463026
CatBoost Regressor: RMSE = 0.5131619185651
Regression Results:
                             RMSE
AdaBoost Regressor       0.807108
Decision Tree Regressor  0.511256
K Neighbors Regressor    0.805090
CatBoost Regressor       0.513162

Classification Results:
Empty DataFrame
Columns: []
Index: []


In [25]:
import joblib

# Modeli ve sonuçları kaydedeceğimiz dosya isimleri
results_file = 'results_regression.txt'

# Modelleri ve sonuçları dosyalara kaydedelim
with open(results_file, 'w') as file:
    for name, model in model_r.items():
        # Modeli yeniden eğitelim
        model.fit(train_x, y_train)
        
        # Tahmin yapalım
        y_pred = model.predict(test_xx)
        
        # RMSE'yi hesaplayalım
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        
        # Sonuçları kaydet
        results_regression[name] = {'RMSE': rmse}
        print(f'{name}: RMSE = {rmse}')
        
        # Modeli kaydet
        model_filename = f'model/{name.replace(" ", "_").lower()}_regressor.pkl'
        joblib.dump(model, model_filename)
        
        # Sonuçları txt dosyasına yaz
        file.write(f'{name}: RMSE = {rmse}\n')

# Sonuçları bir DataFrame olarak da kaydedelim
results_regression_df = pd.DataFrame(results_regression).T
results_regression_df.to_csv("results_regression.csv", index=True)

# Yazdırma
print("\nRegression Results:")
print(results_regression_df)


KeyboardInterrupt: 

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

def predict_sentiment(new_sentence, model, vectorizer):
    # Clean and preprocess the sentence
    cleaned_sentence = procces_sentence(new_sentence)
    # Transform it using the trained vectorizer
    new_sentence_vector = vectorizer.transform([cleaned_sentence])
    new_sentence_vector = new_sentence_vector.toarray()
    
    # Get the probability for each class
    prediction_probabilities = model.predict(new_sentence_vector)[0]
    
    
    
    return prediction_probabilities

# Example usage:
new_sentence = "this is a suicide and you are a criminal"
predicted_sentiment = predict_sentiment(new_sentence, model, vectorizer)
print(f"Sentiment probabilities for the input: {predicted_sentiment}")


Sentiment probabilities for the input: -0.5178470748209099
