In [4]:
import pandas as pd
import random

# Load dataset
# NOTE(review): latin1 can decode any byte sequence, so a wrong guess here would
# garble text silently instead of raising - confirm the file's real encoding.
df = pd.read_csv("translated_file.csv", encoding="latin1")

N_SYNTHETIC_ROWS = 1000     # how many synthetic rows to generate
SEED = 42                   # fixed so that Restart & Run All rewrites the same file
rng = random.Random(SEED)   # private generator; the global `random` state is untouched

# Rating columns that get nudged by at most one step and clamped to 1..5.
rating_cols = ["14. Operational Risk", "14. Technical risks"]

# Categorical answer that is replaced with a uniformly random option.
risk_options = [
    "Financial risk", "Operational risk", "Strategic risk",
    "Technical risk", "Compliance risk"
]
risk_col = "Which risk has had the most significant impact on your business over the past year?"
# Headers may carry stray whitespace, so compare on the stripped name.
risk_cols_in_data = [c for c in df.columns if c.strip() == risk_col]


def _as_int(value):
    """Return ``int(value)``, or ``None`` when the value cannot be converted (NaN, text, ...)."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _jitter_time_taken(value, spread=30, floor=30):
    """Shift a duration by up to ``spread`` either way, never going below ``floor``.

    Accepts a bare number or text shaped like "<number> <unit>" (for example
    "317 seconds"); the unit suffix, when present, is kept in the result.
    A value whose numeric part cannot be parsed is returned unchanged.
    """
    if isinstance(value, str):
        number, _, unit = value.strip().partition(" ")
    else:
        number, unit = value, ""
    seconds = _as_int(number)
    if seconds is None:
        return value
    jittered = max(floor, seconds + rng.randint(-spread, spread))
    return f"{jittered} {unit}" if unit else jittered


# Draw every base row in one call (with replacement, like repeated single draws).
sampled_rows = df.sample(
    n=N_SYNTHETIC_ROWS, replace=True, random_state=SEED
).to_dict(orient="records")

synthetic_data = []

for row in sampled_rows:
    # --- Modify numeric risk ratings (skipped when the value is not numeric) ---
    for col in rating_cols:
        if col in row:
            rating = _as_int(row[col])
            if rating is not None:
                row[col] = min(5, max(1, rating + rng.choice([-1, 0, 1])))

    # --- Randomize categorical answers ---
    for col in risk_cols_in_data:
        row[col] = rng.choice(risk_options)

    # --- Slight variation in time/points ---
    if "Time taken" in row:
        row["Time taken"] = _jitter_time_taken(row["Time taken"])

    if "Total points" in row:
        points = _as_int(row["Total points"])
        if points is not None:
            row["Total points"] = max(1, points + rng.randint(-2, 2))

    synthetic_data.append(row)

# Convert synthetic data to DataFrame
synthetic_df = pd.DataFrame(synthetic_data)

# Combine with original
augmented_df = pd.concat([df, synthetic_df], ignore_index=True)

# Save
augmented_df.to_csv("augmented_survey_data.csv", index=False, encoding="utf-8")

print("✅ Expanded dataset saved as augmented_survey_data.csv")
print("Original rows:", len(df))
print("Synthetic rows:", len(synthetic_df))
print("Final rows:", len(augmented_df))


✅ Expanded dataset saved as augmented_survey_data.csv
Original rows: 999
Synthetic rows: 1000
Final rows: 1999


In [6]:
# Lift pandas' row cap so the frame below is rendered without truncation.
# NOTE(review): this setting is global and stays in effect for later cells.
pd.options.display.max_rows = None

# Read back the augmented survey file to inspect what was written.
df = pd.read_csv("augmented_survey_data.csv")
df

Unnamed: 0,Serial number,Submit the answer time,Time taken,source,Source details,From IP,Total points,1. What is your software company name? (Optional),2. How big is your organization?,3. What type of software development does your company focus on? (Multiple choice) (Web page development),...,22. How effective do you think your current risk management strategy is?,23. What is the biggest challenge your organization faces in managing risks?,24. Do you understand risk management solutions based on artificial intelligence?,25. Are you willing to use artificial intelligence-based tools for risk assessment and mitigation?,26. Which functions do you think are the most useful in a risk management system based on artificial intelligence? (Multiple choice) (Automatic risk detection),26. (Forecast risk),26. (Chamber bots that provide real-time support),26. (Risk classification and priority sorting),26. (Integrated with existing business tools),27. Are you willing to participate in the AI-driven risk management pilot project?
0,1.0,2025/5/22 12:55:56,317 seconds,WeChat,,114.246.237.122 (Beijing-Beijing),12.0,"Beijing Lingxun Technology Co., Ltd.",1.0,1.0,...,3.0,Demand boundary control is not clear enough,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0
1,2.0,2025/5/22 13:05:19,400 seconds,WeChat,,221.198.235.195 (Tianjin-Tianjin),16.0,(null),1.0,1.0,...,2.0,efficiency,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0
2,3.0,2025/5/22 15:27:05,206 seconds,WeChat,,120.244.235.165 (Beijing-Beijing),16.0,(null),1.0,1.0,...,1.0,High communication cost,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
3,4.0,2025/5/22 15:41:35,410 seconds,WeChat,,114.246.255.29 (Beijing-Beijing),11.0,(null),1.0,1.0,...,2.0,Strategic risk,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
4,5.0,2025/5/28 22:45:56,152 seconds,WeChat,,61.48.36.96 (Beijing-Beijing),18.0,Autohome,3.0,1.0,...,2.0,Personnel risk,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0
5,6.0,2025/5/28 22:46:16,122 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),8.0,(null),1.0,0.0,...,4.0,Technology iterates too fast,2.0,2.0,0.0,1.0,0.0,0.0,1.0,2.0
6,7.0,2025/5/29 9:35:10,326 seconds,WeChat,,221.216.117.220 (Beijing-Beijing),14.0,Jinjing,1.0,0.0,...,3.0,Risk assessment is inaccurate,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0
7,8.0,2025/5/29 11:51:41,151 seconds,WeChat,,42.100.4.6 (Heilongjiang-Harbin),4.0,(null),1.0,1.0,...,1.0,No,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
8,9.0,2025/5/29 11:54:33,561 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),15.0,(null),1.0,1.0,...,4.0,Can't effectively avoid,2.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0
9,10.0,2025/6/3 19:35:23,426 seconds,WeChat,,222.131.241.79 (Beijing-Beijing),15.0,"Guangxi Qishi Technology Co., Ltd.",1.0,0.0,...,3.0,Stability of core developers,2.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0


In [7]:
import pandas as pd
import math

# Read the original survey responses.
df = pd.read_csv("translated_file.csv", encoding="latin1")

# Work out how many whole copies of the frame are needed to cover the target size.
target_rows = 2000
multiplier = math.ceil(target_rows / len(df))

# Stack the copies end to end, then keep exactly the first `target_rows` rows.
expanded_df = pd.concat(
    [df for _ in range(multiplier)],
    ignore_index=True,
).head(target_rows)

# Persist the enlarged table.
expanded_df.to_csv("expanded_survey_data.csv", index=False, encoding="utf-8")

# Report the before/after row counts.
print("✅ Expanded dataset saved as expanded_survey_data.csv")
for label, count in (("Original rows:", len(df)), ("Final rows:", len(expanded_df))):
    print(label, count)


✅ Expanded dataset saved as expanded_survey_data.csv
Original rows: 999
Final rows: 2000


In [8]:
# Remove the display row cap (a global pandas option) before showing the frame.
pd.options.display.max_rows = None

# Read back the row-repeated survey file and render it as the cell result.
df = pd.read_csv("expanded_survey_data.csv")
df

Unnamed: 0,Serial number,Submit the answer time,Time taken,source,Source details,From IP,Total points,1. What is your software company name? (Optional),2. How big is your organization?,3. What type of software development does your company focus on? (Multiple choice) (Web page development),...,22. How effective do you think your current risk management strategy is?,23. What is the biggest challenge your organization faces in managing risks?,24. Do you understand risk management solutions based on artificial intelligence?,25. Are you willing to use artificial intelligence-based tools for risk assessment and mitigation?,26. Which functions do you think are the most useful in a risk management system based on artificial intelligence? (Multiple choice) (Automatic risk detection),26. (Forecast risk),26. (Chamber bots that provide real-time support),26. (Risk classification and priority sorting),26. (Integrated with existing business tools),27. Are you willing to participate in the AI-driven risk management pilot project?
0,1.0,2025/5/22 12:55:56,317 seconds,WeChat,,114.246.237.122 (Beijing-Beijing),12.0,"Beijing Lingxun Technology Co., Ltd.",1.0,1.0,...,3.0,Demand boundary control is not clear enough,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0
1,2.0,2025/5/22 13:05:19,400 seconds,WeChat,,221.198.235.195 (Tianjin-Tianjin),16.0,(null),1.0,1.0,...,2.0,efficiency,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0
2,3.0,2025/5/22 15:27:05,206 seconds,WeChat,,120.244.235.165 (Beijing-Beijing),16.0,(null),1.0,1.0,...,1.0,High communication cost,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
3,4.0,2025/5/22 15:41:35,410 seconds,WeChat,,114.246.255.29 (Beijing-Beijing),11.0,(null),1.0,1.0,...,2.0,Strategic risk,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
4,5.0,2025/5/28 22:45:56,152 seconds,WeChat,,61.48.36.96 (Beijing-Beijing),18.0,Autohome,3.0,1.0,...,2.0,Personnel risk,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0
5,6.0,2025/5/28 22:46:16,122 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),8.0,(null),1.0,0.0,...,4.0,Technology iterates too fast,2.0,2.0,0.0,1.0,0.0,0.0,1.0,2.0
6,7.0,2025/5/29 9:35:10,326 seconds,WeChat,,221.216.117.220 (Beijing-Beijing),14.0,Jinjing,1.0,0.0,...,3.0,Risk assessment is inaccurate,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0
7,8.0,2025/5/29 11:51:41,151 seconds,WeChat,,42.100.4.6 (Heilongjiang-Harbin),4.0,(null),1.0,1.0,...,1.0,No,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
8,9.0,2025/5/29 11:54:33,561 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),15.0,(null),1.0,1.0,...,4.0,Can't effectively avoid,2.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0
9,10.0,2025/6/3 19:35:23,426 seconds,WeChat,,222.131.241.79 (Beijing-Beijing),15.0,"Guangxi Qishi Technology Co., Ltd.",1.0,0.0,...,3.0,Stability of core developers,2.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0


In [11]:
# Take the first 20 responses of the full survey file.
df = pd.read_csv("translated_file.csv", encoding="latin1")
df_20 = df.iloc[:20]

# Write the subset to its own CSV; the source file itself is not modified.
df_20.to_csv("translated_file_20.csv", index=False, encoding="utf-8")

print("✅ Saved only first 20 rows as translated_file_20.csv")
print("Final rows:", len(df_20))

✅ Saved only first 20 rows as translated_file_20.csv
Final rows: 20


In [12]:
# translated_file_20.csv is written with encoding="utf-8", so it must be read
# back as UTF-8; decoding it as latin1 would turn every non-ASCII character
# into a multi-character garbage sequence.
df = pd.read_csv("translated_file_20.csv", encoding="utf-8")
# Show every row of the frame (global pandas option, persists for later cells).
pd.set_option("display.max_rows", None)
df

Unnamed: 0,Serial number,Submit the answer time,Time taken,source,Source details,From IP,Total points,1. What is your software company name? (Optional),2. How big is your organization?,3. What type of software development does your company focus on? (Multiple choice) (Web page development),...,22. How effective do you think your current risk management strategy is?,23. What is the biggest challenge your organization faces in managing risks?,24. Do you understand risk management solutions based on artificial intelligence?,25. Are you willing to use artificial intelligence-based tools for risk assessment and mitigation?,26. Which functions do you think are the most useful in a risk management system based on artificial intelligence? (Multiple choice) (Automatic risk detection),26. (Forecast risk),26. (Chamber bots that provide real-time support),26. (Risk classification and priority sorting),26. (Integrated with existing business tools),27. Are you willing to participate in the AI-driven risk management pilot project?
0,1.0,2025/5/22 12:55:56,317 seconds,WeChat,,114.246.237.122 (Beijing-Beijing),12.0,"Beijing Lingxun Technology Co., Ltd.",1.0,1.0,...,3.0,Demand boundary control is not clear enough,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0
1,2.0,2025/5/22 13:05:19,400 seconds,WeChat,,221.198.235.195 (Tianjin-Tianjin),16.0,(null),1.0,1.0,...,2.0,efficiency,2.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0
2,3.0,2025/5/22 15:27:05,206 seconds,WeChat,,120.244.235.165 (Beijing-Beijing),16.0,(null),1.0,1.0,...,1.0,High communication cost,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
3,4.0,2025/5/22 15:41:35,410 seconds,WeChat,,114.246.255.29 (Beijing-Beijing),11.0,(null),1.0,1.0,...,2.0,Strategic risk,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
4,5.0,2025/5/28 22:45:56,152 seconds,WeChat,,61.48.36.96 (Beijing-Beijing),18.0,Autohome,3.0,1.0,...,2.0,Personnel risk,1.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0
5,6.0,2025/5/28 22:46:16,122 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),8.0,(null),1.0,0.0,...,4.0,Technology iterates too fast,2.0,2.0,0.0,1.0,0.0,0.0,1.0,2.0
6,7.0,2025/5/29 9:35:10,326 seconds,WeChat,,221.216.117.220 (Beijing-Beijing),14.0,Jinjing,1.0,0.0,...,3.0,Risk assessment is inaccurate,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0
7,8.0,2025/5/29 11:51:41,151 seconds,WeChat,,42.100.4.6 (Heilongjiang-Harbin),4.0,(null),1.0,1.0,...,1.0,No,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
8,9.0,2025/5/29 11:54:33,561 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),15.0,(null),1.0,1.0,...,4.0,Can't effectively avoid,2.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0
9,10.0,2025/6/3 19:35:23,426 seconds,WeChat,,222.131.241.79 (Beijing-Beijing),15.0,"Guangxi Qishi Technology Co., Ltd.",1.0,0.0,...,3.0,Stability of core developers,2.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0


In [14]:
import pandas as pd
import math

# Load dataset
# translated_file_20.csv is saved with encoding="utf-8", so read it back the
# same way; latin1 would corrupt any non-ASCII text on the round trip.
df = pd.read_csv("translated_file_20.csv", encoding="utf-8")
# Drop per-submission identifiers; errors="ignore" tolerates their absence.
df = df.drop(columns=["Submit the answer time", "Serial number"], errors="ignore")

if df.empty:
    # Without this guard the multiplier computation fails with an opaque ZeroDivisionError.
    raise ValueError("translated_file_20.csv contains no rows; nothing to expand")

# Number of whole copies of the frame needed to reach the target size.
target_rows = 3000
multiplier = math.ceil(target_rows / len(df))

# Repeat the rows, then trim to exactly `target_rows`.
expanded_df = pd.concat([df] * multiplier, ignore_index=True)

expanded_df = expanded_df.iloc[:target_rows]


# Save
# NOTE: this overwrites any earlier expanded_survey_data.csv from a previous run or cell.
expanded_df.to_csv("expanded_survey_data.csv", index=False, encoding="utf-8")

print("Expanded dataset saved as expanded_survey_data.csv")
print("Original rows:", len(df))
print("Final rows:", len(expanded_df))


Expanded dataset saved as expanded_survey_data.csv
Original rows: 20
Final rows: 3000


In [16]:
# Remove the display row cap (a global pandas option) before showing the frame.
pd.options.display.max_rows = None

# Load the text-valued version of the survey and render it as the cell result.
df1 = pd.read_csv("translated_file_new2.csv", encoding="latin1")
df1

Unnamed: 0,Serial number,Submit the answer time,Time taken,source,Source details,From IP,Total points,1. What is your software company name? (Optional),2. How big is your organization?,3. What type of software development does your company focus on? (Multiple choice),...,18. How severe is compliance risks (such as legal and regulatory issues) affecting your organization on the scale of 1 to 5?,19. What are the three major risks that your company prioritizes mitigation? (Select up to 3 options),20. How does your organization evaluate and classify risks?,21. What risk management strategies does your company currently adopt? (Multiple choice),22. How effective do you think your current risk management strategy is?,23. What is the biggest challenge your organization faces in managing risks?,24. Do you understand risk management solutions based on artificial intelligence?,25. Are you willing to use artificial intelligence-based tools for risk assessment and mitigation?,26. Which functions do you think are the most useful in a risk management system based on artificial intelligence? (Multiple choice),27. Are you willing to participate in the AI-driven risk management pilot project?
0,1,2025/5/22 12:55:56,317 seconds,WeChat,,114.246.237.122 (Beijing-Beijing),12,"Beijing Lingxun Technology Co., Ltd.",Small (1-50 employees),Web page development?Mobile application develo...,...,generally,"Strategic risks (e.g., market competition, fai...",Manual assessment of management,Regular risk audit ?Employee risk management t...,generally,Demand boundary control is not clear enough,no,"If it proves valid, maybe",Automatic risk detection,yes
1,2,2025/5/22 13:05:19,400 seconds,WeChat,,221.198.235.195 (Tianjin-Tianjin),16,(null),Small (1-50 employees),Web page development?Mobile application develo...,...,generally,"Financial risks (such as budget overspending, ...",Manual assessment of management,Regular risk audits?Employee risk management t...,More effective,efficiency,no,"If it proves valid, maybe",Risk classification and prioritization ?Integr...,yes
2,3,2025/5/22 15:27:05,206 seconds,WeChat,,120.244.235.165 (Beijing-Beijing),16,(null),Small (1-50 employees),Web page development?Mobile application develo...,...,generally,"Financial risks (such as budget overspending, ...",Manual assessment of management,Regular risk audits?Cybersecurity policies and...,Very effective,High communication cost,yes,"Yes, we are interested",Automatic risk detection ? Predicting risk? Ch...,yes
3,4,2025/5/22 15:41:35,410 seconds,WeChat,,114.246.255.29 (Beijing-Beijing),11,(null),Small (1-50 employees),Web page development?Mobile application develo...,...,No impact,"Strategic risks (e.g., market competition, fai...",Manual assessment of management,Regular risk audits?Emergency fund reserves?Cy...,More effective,Strategic risk,no,"If it proves valid, maybe",Automatic risk detection ? Predicting risk? Ch...,yes
4,5,2025/5/28 22:45:56,152 seconds,WeChat,,61.48.36.96 (Beijing-Beijing),18,Autohome,Large (more than 200 employees),Web page development?Mobile application develo...,...,Significant impact,"Financial risks (such as budget overspending, ...",Data-driven analysis,Regular risk audit ?Employee risk management t...,More effective,Personnel risk,yes,"If it proves valid, maybe",Automatic risk detection? Chatbots that provid...,yes
5,6,2025/5/28 22:46:16,122 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),8,(null),Small (1-50 employees),Game development,...,Dissatisfied,"Operational risks (such as inefficiency, proje...",Data-driven analysis,Regular risk audits?Cybersecurity policies and...,Not very effective,Technology iterates too fast,no,"If it proves valid, maybe",Predicting risk?Integrate with existing busine...,no
6,7,2025/5/29 9:35:10,326 seconds,WeChat,,221.216.117.220 (Beijing-Beijing),14,Jinjing,Small (1-50 employees),Artificial Intelligence/Machine Learning Solut...,...,generally,"Financial risks (such as budget overspending, ...",Manual assessment of management,Regular risk audit ?Employee risk management t...,generally,Risk assessment is inaccurate,no,"If it proves valid, maybe",Predicted Risk?Risk Classification and Priorit...,yes
7,8,2025/5/29 11:51:41,151 seconds,WeChat,,42.100.4.6 (Heilongjiang-Harbin),4,(null),Small (1-50 employees),Web page development?Mobile application develo...,...,No impact,"Financial risks (such as budget overspending, ...",Data-driven analysis,Regular risk audits?Employee risk management t...,Very effective,No,yes,"Yes, we are interested",Automatic risk detection ? Predicting risk? Ch...,no
8,9,2025/5/29 11:54:33,561 seconds,WeChat,,120.244.232.231 (Beijing-Beijing),15,(null),Small (1-50 employees),Web page development?Mobile application develo...,...,generally,"Strategic risks (such as market competition, f...",No formal risk assessment process,No formal risk management system,Not very effective,Can't effectively avoid,no,"If it proves valid, maybe",Automatic risk detection ?Predicted risk ?Risk...,yes
9,10,2025/6/3 19:35:23,426 seconds,WeChat,,222.131.241.79 (Beijing-Beijing),15,"Guangxi Qishi Technology Co., Ltd.",Small (1-50 employees),Artificial Intelligence/Machine Learning Solut...,...,generally,"Financial risks (such as budget overspending, ...",Manual assessment of management,Employee Risk Management Training?Emergency Fu...,generally,Stability of core developers,no,"If it proves valid, maybe",Integrate with existing business tools,no


In [17]:
import pandas as pd
import math

# Load dataset
# NOTE(review): latin1 decodes any byte sequence without error - confirm that
# translated_file_new2.csv is really stored in that encoding.
df = pd.read_csv("translated_file_new2.csv", encoding="latin1")
# Drop per-submission identifiers; errors="ignore" tolerates their absence.
df = df.drop(columns=["Submit the answer time", "Serial number"], errors="ignore")

if df.empty:
    # Without this guard the multiplier computation fails with an opaque ZeroDivisionError.
    raise ValueError("translated_file_new2.csv contains no rows; nothing to expand")

# Number of whole copies of the frame needed to reach the target size.
target_rows = 3000
multiplier = math.ceil(target_rows / len(df))

# Repeat the rows, then trim to exactly `target_rows`.
expanded_df = pd.concat([df] * multiplier, ignore_index=True)

expanded_df = expanded_df.iloc[:target_rows]


# Save
# The path is named once so the confirmation message cannot drift from the file written.
output_csv = "expanded_survey_data2.csv"
expanded_df.to_csv(output_csv, index=False, encoding="utf-8")

print(f"Expanded dataset saved as {output_csv}")
print("Original rows:", len(df))
print("Final rows:", len(expanded_df))


Expanded dataset saved as expanded_survey_data.csv
Original rows: 20
Final rows: 3000


In [20]:
# Load both datasets
# reset_index(drop=True) gives each frame a fresh 0..n-1 index so the
# column-wise concat below pairs rows purely by position.
df1 = pd.read_csv("expanded_survey_data.csv", encoding="utf-8").reset_index(drop=True)
df2 = pd.read_csv("expanded_survey_data2.csv", encoding="utf-8").reset_index(drop=True)

# concat(axis=1) aligns on the index; with unequal lengths it would pad the
# shorter frame with NaN rows instead of failing, so check explicitly.
if len(df1) != len(df2):
    raise ValueError(
        f"Row count mismatch: expanded_survey_data.csv has {len(df1)} rows, "
        f"expanded_survey_data2.csv has {len(df2)}"
    )

# NOTE(review): both inputs appear to share some column names (e.g. "Time taken",
# "Total points"), so the combined frame carries repeated headers - confirm that
# downstream code expects this, or suffix/drop the duplicates.
data_for_model = pd.concat([df1, df2], axis=1)

# Save combined dataset
data_for_model.to_csv("combined_dataset.csv", index=False, encoding="utf-8")

print("Combined dataset saved as combined_dataset.csv")
print("Rows:", len(data_for_model))
print("Columns:", len(data_for_model.columns))


Combined dataset saved as combined_dataset.csv
Rows: 3000
Columns: 111
