In [5]:
pip install openai



In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import os

# Working directory that holds tweets.csv; relative paths used by later cells
# resolve against it. Override with the PROJECT_DIR environment variable
# instead of editing the notebook; the default keeps the original location.
PROJECT_DIR = os.environ.get("PROJECT_DIR", "/Project_X")
os.chdir(PROJECT_DIR)

# Load the CSV file
df = pd.read_csv('tweets.csv')

# TfidfVectorizer rejects NaN documents, so treat missing tweets as empty
# text. Rows are kept (not dropped) so km.labels_ stays aligned with df.
tweet_text = df['Tweet Text'].fillna('').astype(str)

# Create a TF-IDF vectorizer (one document per tweet)
# NOTE(review): URL / HTML-entity fragments such as 'https' and 'amp' can
# surface as top terms; consider extending the stop-word list -- TODO confirm.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tweet_text)

# Perform KMeans clustering to find topics.
# random_state pins the centroid initialisation so cluster ids (and the
# summaries derived from them) are reproducible across kernel restarts;
# n_init is explicit because its default differs between scikit-learn versions.
num_clusters = 5
RANDOM_STATE = 42
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=RANDOM_STATE)
km.fit(X)

# Get the top terms for each cluster: feature indices sorted by descending
# centroid weight, one row per cluster.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

# Print the top terms for each cluster
TOP_N_TERMS = 10
for i in range(num_clusters):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :TOP_N_TERMS]:
        print(f" {terms[ind]}")
    print("\n")

# Assign clusters to tweets (labels_ is in the same row order as tweet_text)
df['Cluster'] = km.labels_

# Display the DataFrame with clusters
print(df[['User Name', 'Tweet Text', 'Cluster']])


Cluster 0:
 mexico
 sewage
 epa
 administrator
 san
 diego
 crisis
 https
 just
 tijuana


Cluster 1:
 clean
 public
 federal
 fighting
 health
 demanding
 protect
 action
 beaches
 government


Cluster 2:
 president
 work
 wants
 amp
 health
 https
 crisis
 mexico
 border
 tijuana


Cluster 3:
 need
 ve
 river
 ground
 shovels
 fix
 https
 federal
 crisis
 tijuana


Cluster 4:
 gallons
 million
 tijuana
 pumps
 sewage
 flowing
 billion
 https
 toxic
 day


                              User Name  \
0                      Wall Street Apes   
1                Supervisor Jim Desmond   
2                      Wall Street Apes   
3                            Mike Levin   
4                        Laura Ingraham   
5                        Susan Crabtree   
6                          Tyler O'Neil   
7                Supervisor Jim Desmond   
8                          Amy Reichert   
9                            Mike Levin   
10            Assemblyman Jeff Gonzalez   
11                    

In [14]:
import openai

# Combine tweets in each cluster into a single text
# (result has one row per cluster with columns 'Cluster' and 'Tweet Text')
clustered_tweets = df.groupby('Cluster')['Tweet Text'].agg(' '.join).reset_index()

# Set up OpenAI API key.
# The key is read from the environment so the secret is never stored in the
# saved notebook; fail early with a clear message when it is missing rather
# than on the first API request.
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
openai.api_key = api_key
from openai import OpenAI

client = OpenAI(api_key=api_key) # Initialize the OpenAI client

# Function to summarize text using the OpenAI chat completions API
def summarize_text(text, model="gpt-3.5-turbo", max_tokens=150, api_client=None):
    """Return a short summary of ``text`` produced by an OpenAI chat model.

    Parameters
    ----------
    text : str
        The text to summarize.
    model : str, optional
        Chat model name passed to the API (default ``"gpt-3.5-turbo"``).
    max_tokens : int, optional
        Upper bound on the length of the generated summary (default 150).
    api_client : optional
        Object exposing ``chat.completions.create``. When omitted, the
        notebook-level ``client`` is used.

    Returns
    -------
    str
        The summary with surrounding whitespace removed, or ``""`` when
        ``text`` is missing/blank or the model returns no content.
    """
    # Nothing to summarize: skip the API request entirely.
    if text is None or not str(text).strip():
        return ""

    # Resolve the global lazily so the function can be defined (and tested)
    # before the notebook-level client exists.
    if api_client is None:
        api_client = client

    response = api_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes text."},
            {"role": "user", "content": f"Summarize the following text:\n\n{text}"}
        ],
        max_tokens=max_tokens
    )
    # Extract the summary from the response; content may be None, in which
    # case an empty string is returned instead of failing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# Summarize the combined text of every cluster (one API request per row)
cluster_summaries = clustered_tweets['Tweet Text'].map(summarize_text)
clustered_tweets['Summary'] = cluster_summaries

# Show each cluster id next to its generated summary
print(clustered_tweets.loc[:, ['Cluster', 'Summary']])

   Cluster                                            Summary
0        0  Mexico has been sending tens of millions of il...
1        1  Leaders, including @epaleezeldin, are fighting...
2        2  The text discusses the Tijuana Sewage Crisis a...
3        3  The text discusses the Tijuana River sewage cr...
4        4  The text discusses the ongoing sewage crisis i...


In [16]:
# Persist the per-cluster text and summaries. index=False because the frame
# only carries the default positional index, which would otherwise be written
# as an unnamed extra column.
clustered_tweets.to_csv('clustered_tweets.csv', index=False)